From ec4d29849dc1d0357c73690722aea1a7802dd0bc Mon Sep 17 00:00:00 2001 From: Tim Ledbetter Date: Thu, 13 Jun 2024 19:43:08 +0100 Subject: [PATCH] LibWeb: Support finding text split across multiple text nodes Previously, the find in page function would fail to find text which was split across multiple text nodes. For example, given the following markup: `WH<span>F</span>` the query `WHF` would previously fail to be matched. This is fixed by traversing all of the document's text nodes, constructing a complete string to query against and keeping track of the locations where that string is split across multiple nodes. --- Userland/Libraries/LibWeb/DOM/Document.cpp | 70 ++++++++++++++++------ 1 file changed, 53 insertions(+), 17 deletions(-) diff --git a/Userland/Libraries/LibWeb/DOM/Document.cpp b/Userland/Libraries/LibWeb/DOM/Document.cpp index 3343662ea97..b9b4bec9a4c 100644 --- a/Userland/Libraries/LibWeb/DOM/Document.cpp +++ b/Userland/Libraries/LibWeb/DOM/Document.cpp @@ -5113,29 +5113,65 @@ Vector> Document::find_matching_text(String const& query, if (!document_element() || !document_element()->layout_node()) return {}; - Vector> matches; + struct TextPositionNode { + DOM::Text& dom_node; + size_t start_offset { 0 }; + }; + + StringBuilder builder; + Vector text_positions; + size_t current_start_position = 0; + String current_node_text; document_element()->layout_node()->for_each_in_inclusive_subtree_of_type([&](auto const& text_node) { - auto const& text = text_node.text_for_rendering(); - size_t offset = 0; - while (true) { - auto match_index = case_sensitivity == CaseSensitivity::CaseInsensitive - ? 
text.find_byte_offset_ignoring_case(query, offset) - : text.find_byte_offset(query, offset); - if (!match_index.has_value()) - break; - - auto range = create_range(); - auto& dom_node = const_cast(text_node.dom_node()); - (void)range->set_start(dom_node, match_index.value()); - (void)range->set_end(dom_node, match_index.value() + query.code_points().length()); - - matches.append(range); - offset = match_index.value() + 1; + auto& dom_node = const_cast(text_node.dom_node()); + if (text_positions.is_empty()) { + text_positions.empend(dom_node); + } else { + current_start_position += current_node_text.bytes_as_string_view().length(); + text_positions.empend(dom_node, current_start_position); } + current_node_text = text_node.text_for_rendering(); + builder.append(current_node_text); return TraversalDecision::Continue; }); + if (text_positions.is_empty()) + return {}; + + size_t offset = 0; + auto* match_start_position = &text_positions[0]; + auto text = builder.to_string_without_validation(); + Vector> matches; + while (true) { + auto match_index = case_sensitivity == CaseSensitivity::CaseInsensitive + ? 
text.find_byte_offset_ignoring_case(query, offset) + : text.find_byte_offset(query, offset); + if (!match_index.has_value()) + break; + + size_t i = 0; + for (; i < text_positions.size() && match_index.value() > text_positions[i].start_offset; ++i) + match_start_position = &text_positions[i]; + + auto range = create_range(); + auto start_position = match_index.value() - match_start_position->start_offset; + auto& start_dom_node = match_start_position->dom_node; + (void)range->set_start(start_dom_node, start_position); + + auto* match_end_position = match_start_position; + for (; i < text_positions.size() && match_index.value() + query.bytes_as_string_view().length() > text_positions[i].start_offset; ++i) + match_end_position = &text_positions[i]; + + auto& end_dom_node = match_end_position->dom_node; + auto end_position = match_index.value() - match_end_position->start_offset + query.bytes_as_string_view().length(); + (void)range->set_end(end_dom_node, end_position); + + matches.append(range); + offset = match_index.value() + query.bytes_as_string_view().length() + 1; + match_start_position = match_end_position; + } + return matches; }