#include #include #include #include #include #include #include #include #include #include #include namespace lazy_html { // Ensures the given cleanup function is executed when the guard goes // out of scope. class ScopeGuard { public: ScopeGuard(std::function fun) : fun(fun), active(true) {} ~ScopeGuard() { if (active) { fun(); } } void deactivate() { active = false; } private: std::function fun; bool active; }; namespace atoms { auto ElixirLazyHTML = fine::Atom("Elixir.LazyHTML"); auto comment = fine::Atom("comment"); auto resource = fine::Atom("resource"); } // namespace atoms struct DocumentRef { lxb_html_document_t *document; DocumentRef(lxb_html_document_t *document) : document(document) {} ~DocumentRef() { lxb_html_document_destroy(this->document); } }; struct LazyHTML { std::shared_ptr document_ref; std::vector nodes; bool from_selector; LazyHTML(std::shared_ptr document_ref, std::vector nodes, bool from_selector) : document_ref(document_ref), nodes(nodes), from_selector(from_selector) { } }; FINE_RESOURCE(LazyHTML); struct ExLazyHTML { fine::ResourcePtr resource; ExLazyHTML() {} ExLazyHTML(fine::ResourcePtr resource) : resource(resource) {} static constexpr auto module = &atoms::ElixirLazyHTML; static constexpr auto fields() { return std::make_tuple( std::make_tuple(&ExLazyHTML::resource, &atoms::resource)); } }; ERL_NIF_TERM make_new_binary(ErlNifEnv *env, size_t size, const unsigned char *data) { ERL_NIF_TERM term; auto term_data = enif_make_new_binary(env, size, &term); memcpy(term_data, data, size); return term; } ExLazyHTML from_document(ErlNifEnv *env, ErlNifBinary html) { auto document = lxb_html_document_create(); if (document == NULL) { throw std::runtime_error("failed to create document"); } auto document_guard = ScopeGuard([&]() { lxb_html_document_destroy(document); }); auto status = lxb_html_document_parse(document, html.data, html.size); if (status != LXB_STATUS_OK) { throw std::runtime_error("failed to parse html document"); } auto document_ref = std::make_shared(document); document_guard.deactivate(); auto nodes = std::vector(); for (auto node = lxb_dom_node_first_child(lxb_dom_interface_node(document)); node != NULL; node = lxb_dom_node_next(node)) { nodes.push_back(node); } return ExLazyHTML(fine::make_resource(document_ref, nodes, false)); } FINE_NIF(from_document, ERL_NIF_DIRTY_JOB_CPU_BOUND); ExLazyHTML from_fragment(ErlNifEnv *env, ErlNifBinary html) { auto document = lxb_html_document_create(); if (document == NULL) { throw std::runtime_error("failed to create document"); } auto document_guard = ScopeGuard([&]() { lxb_html_document_destroy(document); }); auto context = lxb_dom_document_create_element( &document->dom_document, reinterpret_cast("body"), 4, NULL); auto parse_root = lxb_html_document_parse_fragment(document, context, html.data, html.size); if (parse_root == NULL) { throw std::runtime_error("failed to parse html fragment"); } auto document_ref = std::make_shared(document); document_guard.deactivate(); auto nodes = std::vector(); for (auto node = lxb_dom_node_first_child(parse_root); node != NULL; node = lxb_dom_node_next(node)) { nodes.push_back(node); } return ExLazyHTML(fine::make_resource(document_ref, nodes, false)); } FINE_NIF(from_fragment, ERL_NIF_DIRTY_JOB_CPU_BOUND); void append_escaping(std::string &html, const unsigned char *data, size_t length, size_t unescaped_prefix_size = 0) { size_t offset = 0; size_t size = unescaped_prefix_size; for (size_t i = unescaped_prefix_size; i < length; ++i) { auto ch = data[i]; if (ch == '<') { if (size > 0) { html.append(reinterpret_cast(data + offset), size); } offset = i + 1; size = 0; html.append("<"); } else if (ch == '>') { if (size > 0) { html.append(reinterpret_cast(data + offset), size); } offset = i + 1; size = 0; html.append(">"); } else if (ch == '&') { if (size > 0) { html.append(reinterpret_cast(data + offset), size); } offset = i + 1; size = 0; html.append("&"); } else if (ch == '"') { if (size > 0) { html.append(reinterpret_cast(data + offset), size); } offset = i + 1; size = 0; html.append("""); } else if (ch == '\'') { if (size > 0) { html.append(reinterpret_cast(data + offset), size); } offset = i + 1; size = 0; html.append("'"); } else { size++; } } if (size > 0) { html.append(reinterpret_cast(data + offset), size); } } bool is_noescape_text_node(lxb_dom_node_t *node) { if (node->parent != NULL) { switch (node->parent->local_name) { case LXB_TAG_STYLE: case LXB_TAG_SCRIPT: case LXB_TAG_XMP: case LXB_TAG_IFRAME: case LXB_TAG_NOEMBED: case LXB_TAG_NOFRAMES: case LXB_TAG_PLAINTEXT: return true; } } return false; } size_t leading_whitespace_size(const unsigned char *data, size_t length) { auto size = 0; for (size_t i = 0; i < length; i++) { auto ch = data[i]; if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') { size++; } else { return size; } } return size; } lxb_dom_node_t *template_aware_first_child(lxb_dom_node_t *node) { if (lxb_html_tree_node_is(node, LXB_TAG_TEMPLATE)) { //