diff --git a/AK/CircularBuffer.cpp b/AK/CircularBuffer.cpp
index 2d80aaaabb2..05e6d7fbf15 100644
--- a/AK/CircularBuffer.cpp
+++ b/AK/CircularBuffer.cpp
@@ -249,4 +249,117 @@ ErrorOr<size_t> CircularBuffer::copy_from_seekback(size_t distance, size_t lengt
     return length - remaining_length;
 }
 
+ErrorOr<Vector<CircularBuffer::Match>> CircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length, Optional<Vector<size_t> const&> distance_hints) const
+{
+    VERIFY(minimum_length > 0);
+
+    // Clip the maximum length to the amount of data that we actually store.
+    if (maximum_length > m_used_space)
+        maximum_length = m_used_space;
+
+    if (maximum_length < minimum_length)
+        return Vector<Match> {};
+
+    Vector<Match> matches;
+
+    if (distance_hints.has_value()) {
+        // If we have any hints, verify and use those.
+        // Distances are relative to the read head, so the used space (between read head and write head) has to be subtracted from the seekback limit.
+        auto const maximum_distance = m_seekback_limit - used_space();
+
+        for (auto const& distance : distance_hints.value()) {
+            // TODO: This does not yet support looping repetitions.
+            if (distance < minimum_length)
+                continue;
+
+            // Hints beyond the stored seekback data can never be valid. Without this check the offset below would point at bytes outside of the seekback range (or underflow for very large hints).
+            if (distance > maximum_distance)
+                continue;
+
+            auto needle_offset = (capacity() + m_reading_head) % capacity();
+            auto haystack_offset = (capacity() + m_reading_head - distance) % capacity();
+
+            for (size_t i = 0; i < minimum_length; i++) {
+                if (m_buffer[needle_offset] != m_buffer[haystack_offset])
+                    break;
+
+                needle_offset = (needle_offset + 1) % capacity();
+                haystack_offset = (haystack_offset + 1) % capacity();
+
+                if (i + 1 == minimum_length)
+                    TRY(matches.try_empend(distance, minimum_length));
+            }
+        }
+    } else {
+        // Otherwise, use memmem to find the initial matches.
+        // Note: We have the read head as our reference point, but `next_read_span_with_seekback` isn't aware of that and continues to use the write head.
+        //       Therefore, we need to make sure to slice off the extraneous bytes from the end of the span and shift the returned distances by the correct amount.
+        size_t haystack_offset_from_start = 0;
+        Vector<ReadonlyBytes, 2> haystack;
+        haystack.append(next_read_span_with_seekback(m_seekback_limit));
+        if (haystack[0].size() < m_seekback_limit - used_space())
+            haystack.append(next_read_span_with_seekback(m_seekback_limit - haystack[0].size()));
+
+        // NOTE(review): This assumes that the last span ends at the write head and holds all of the used space - confirm for used data that wraps around the end of the buffer.
+        haystack.last() = haystack.last().trim(haystack.last().size() - used_space());
+
+        // NOTE(review): Presumably `next_read_span()` only returns contiguous data, in which case the needle can be shorter than `minimum_length` when the used space wraps around - confirm.
+        auto needle = next_read_span().trim(minimum_length);
+
+        auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
+        while (memmem_match.has_value()) {
+            auto match_offset = memmem_match.release_value();
+
+            // Add the match to the list of matches to work with.
+            TRY(matches.try_empend(m_seekback_limit - used_space() - haystack_offset_from_start - match_offset, minimum_length));
+
+            auto size_to_discard = match_offset + 1;
+
+            // Trim away the already processed bytes from the haystack.
+            haystack_offset_from_start += size_to_discard;
+            while (size_to_discard > 0) {
+                if (haystack[0].size() < size_to_discard) {
+                    size_to_discard -= haystack[0].size();
+                    haystack.remove(0);
+                } else {
+                    haystack[0] = haystack[0].slice(size_to_discard);
+                    break;
+                }
+            }
+
+            if (haystack.size() == 0)
+                break;
+
+            // Try and find the next match.
+            memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
+        }
+    }
+
+    // From now on, all matches that we have stored have at least a length of `minimum_length` and they all refer to the same value.
+    // For the remaining part, we will keep checking the next byte incrementally and keep eliminating matches until we eliminated all of them.
+    Vector<Match> next_matches;
+
+    for (size_t offset = minimum_length; offset < maximum_length; offset++) {
+        auto needle_data = m_buffer[(capacity() + m_reading_head + offset) % capacity()];
+
+        for (auto const& match : matches) {
+            auto haystack_data = m_buffer[(capacity() + m_reading_head - match.distance + offset) % capacity()];
+
+            if (haystack_data != needle_data)
+                continue;
+
+            TRY(next_matches.try_empend(match.distance, match.length + 1));
+        }
+
+        // None of the matches could be extended by another byte, so the current set already holds the longest ones.
+        if (next_matches.size() == 0)
+            return matches;
+
+        swap(matches, next_matches);
+        next_matches.clear_with_capacity();
+    }
+
+    return matches;
+}
+
 }
diff --git a/AK/CircularBuffer.h b/AK/CircularBuffer.h
index 6b8e7c4be99..56341824517 100644
--- a/AK/CircularBuffer.h
+++ b/AK/CircularBuffer.h
@@ -9,6 +9,7 @@
 #include <AK/ByteBuffer.h>
 #include <AK/Error.h>
 #include <AK/Noncopyable.h>
+#include <AK/Vector.h>
 
 namespace AK {
 
@@ -36,6 +37,15 @@ public:
 
     ErrorOr<size_t> copy_from_seekback(size_t distance, size_t length);
 
+    struct Match {
+        size_t distance;
+        size_t length;
+    };
+    /// This searches the seekback buffer (between read head and limit) for occurrences that match the next `minimum_length` to `maximum_length` bytes from the read buffer and returns the longest matches.
+    /// Supplying any hints will only consider those distances, in case existing offsets need to be validated.
+    /// Note that, since we only start searching at the read head, the length between read head and write head is excluded from the distance.
+    ErrorOr<Vector<Match>> find_copy_in_seekback(size_t maximum_length, size_t minimum_length = 2, Optional<Vector<size_t> const&> distance_hints = {}) const;
+
     [[nodiscard]] size_t empty_space() const;
     [[nodiscard]] size_t used_space() const;
     [[nodiscard]] size_t capacity() const;
diff --git a/Tests/AK/TestCircularBuffer.cpp b/Tests/AK/TestCircularBuffer.cpp
index 3b6b56caf7d..a86f57a6826 100644
--- a/Tests/AK/TestCircularBuffer.cpp
+++ b/Tests/AK/TestCircularBuffer.cpp
@@ -329,6 +329,116 @@ TEST_CASE(offset_of_with_until_and_after_wrapping_around)
     EXPECT_EQ(result.value_or(42), 14ul);
 }
 
+TEST_CASE(find_copy_in_seekback)
+{
+    auto haystack = "ABABCABCDAB"sv.bytes();
+    auto needle = "ABCD"sv.bytes();
+
+    // Set up the buffer for testing: the haystack ends up as already-read seekback data, the needle as unread data.
+    auto buffer = MUST(CircularBuffer::create_empty(haystack.size() + needle.size()));
+    auto written_haystack_bytes = buffer.write(haystack);
+    VERIFY(written_haystack_bytes == haystack.size());
+    MUST(buffer.discard(haystack.size()));
+    auto written_needle_bytes = buffer.write(needle);
+    VERIFY(written_needle_bytes == needle.size());
+
+    {
+        // Find the largest matches with a length between 1 and 1 (all "A").
+        auto matches = MUST(buffer.find_copy_in_seekback(1, 1));
+        EXPECT_EQ(matches.size(), 4ul);
+        EXPECT_EQ(matches[0].distance, 11ul);
+        EXPECT_EQ(matches[0].length, 1ul);
+        EXPECT_EQ(matches[1].distance, 9ul);
+        EXPECT_EQ(matches[1].length, 1ul);
+        EXPECT_EQ(matches[2].distance, 6ul);
+        EXPECT_EQ(matches[2].length, 1ul);
+        EXPECT_EQ(matches[3].distance, 2ul);
+        EXPECT_EQ(matches[3].length, 1ul);
+    }
+
+    {
+        // Find the largest matches with a length between 1 and 2 (all "AB", everything smaller gets eliminated).
+        auto matches = MUST(buffer.find_copy_in_seekback(2, 1));
+        EXPECT_EQ(matches.size(), 4ul);
+        EXPECT_EQ(matches[0].distance, 11ul);
+        EXPECT_EQ(matches[0].length, 2ul);
+        EXPECT_EQ(matches[1].distance, 9ul);
+        EXPECT_EQ(matches[1].length, 2ul);
+        EXPECT_EQ(matches[2].distance, 6ul);
+        EXPECT_EQ(matches[2].length, 2ul);
+        EXPECT_EQ(matches[3].distance, 2ul);
+        EXPECT_EQ(matches[3].length, 2ul);
+    }
+
+    {
+        // Find the largest matches with a length between 1 and 3 (all "ABC", everything smaller gets eliminated).
+        auto matches = MUST(buffer.find_copy_in_seekback(3, 1));
+        EXPECT_EQ(matches.size(), 2ul);
+        EXPECT_EQ(matches[0].distance, 9ul);
+        EXPECT_EQ(matches[0].length, 3ul);
+        EXPECT_EQ(matches[1].distance, 6ul);
+        EXPECT_EQ(matches[1].length, 3ul);
+    }
+
+    {
+        // Find the largest matches with a length between 1 and 4 (all "ABCD", everything smaller gets eliminated).
+        auto matches = MUST(buffer.find_copy_in_seekback(4, 1));
+        EXPECT_EQ(matches.size(), 1ul);
+        EXPECT_EQ(matches[0].distance, 6ul);
+        EXPECT_EQ(matches[0].length, 4ul);
+    }
+
+    {
+        // Find the largest matches with a length between 1 and 5 (all "ABCD", everything smaller gets eliminated, and nothing larger exists).
+        auto matches = MUST(buffer.find_copy_in_seekback(5, 1));
+        EXPECT_EQ(matches.size(), 1ul);
+        EXPECT_EQ(matches[0].distance, 6ul);
+        EXPECT_EQ(matches[0].length, 4ul);
+    }
+
+    {
+        // Find the largest matches with a length between 4 and 5 (all "ABCD", everything smaller never gets found, nothing larger exists).
+        auto matches = MUST(buffer.find_copy_in_seekback(5, 4));
+        EXPECT_EQ(matches.size(), 1ul);
+        EXPECT_EQ(matches[0].distance, 6ul);
+        EXPECT_EQ(matches[0].length, 4ul);
+    }
+
+    {
+        // Find the largest matches with a length between 5 and 5 (nothing is found).
+        auto matches = MUST(buffer.find_copy_in_seekback(5, 5));
+        EXPECT_EQ(matches.size(), 0ul);
+    }
+
+    {
+        // Find the largest matches with a length between 1 and 2 (selected "AB", everything smaller gets eliminated).
+        auto matches = MUST(buffer.find_copy_in_seekback(2, 1, Vector<size_t> { 6ul, 9ul }));
+        EXPECT_EQ(matches.size(), 2ul);
+        EXPECT_EQ(matches[0].distance, 6ul);
+        EXPECT_EQ(matches[0].length, 2ul);
+        EXPECT_EQ(matches[1].distance, 9ul);
+        EXPECT_EQ(matches[1].length, 2ul);
+    }
+
+    {
+        // Check that we don't find anything for hints before the valid range.
+        auto matches = MUST(buffer.find_copy_in_seekback(2, 1, Vector<size_t> { 0ul }));
+        EXPECT_EQ(matches.size(), 0ul);
+    }
+
+    {
+        // Check that we don't find anything for hints after the valid range.
+        auto matches = MUST(buffer.find_copy_in_seekback(2, 1, Vector<size_t> { 12ul }));
+        EXPECT_EQ(matches.size(), 0ul);
+    }
+
+    {
+        // Check that we don't find anything if the minimum length exceeds the maximum length.
+        auto matches = MUST(buffer.find_copy_in_seekback(12, 13));
+        EXPECT_EQ(matches.size(), 0ul);
+    }
+}
+
 BENCHMARK_CASE(looping_copy_from_seekback)
 {
     auto circular_buffer = MUST(CircularBuffer::create_empty(16 * MiB));