LibUnicode: Define case-insensitive string comparison more generically

The only user is currently String::equals_ignoring_case, but LibRegex
will need to do the same case-folded comparison with UTF-32 data. As it
turns out, the comparison works with all Unicode view types without much
fuss.
This commit is contained in:
Timothy Flynn 2023-11-08 10:13:40 -05:00 committed by Tim Flynn
parent 370ea9441c
commit 6070df40f3
Notes: sideshowbarker 2024-07-18 05:01:22 +09:00
3 changed files with 68 additions and 53 deletions

View File

@ -8,6 +8,8 @@
#include <AK/Platform.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/UnicodeUtils.h>
@ -67,6 +69,68 @@ ErrorOr<String> to_unicode_casefold_full(StringView string)
return builder.to_string();
}
template<typename ViewType>
class CasefoldStringComparator {
public:
explicit CasefoldStringComparator(ViewType string)
: m_string(string)
, m_it(m_string.begin())
{
}
bool has_more_data() const
{
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
}
u32 next_code_point()
{
VERIFY(has_more_data());
if (m_casefolded_code_points.is_empty()) {
m_current_code_point = *m_it;
++m_it;
m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
}
auto code_point = m_casefolded_code_points[0];
m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
return code_point;
}
private:
ViewType m_string;
typename ViewType::Iterator m_it;
u32 m_current_code_point { 0 };
Utf32View m_casefolded_code_points;
};
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
template<typename ViewType>
bool equals_ignoring_case(ViewType lhs, ViewType rhs)
{
// A string X is a caseless match for a string Y if and only if:
// toCasefold(X) = toCasefold(Y)
CasefoldStringComparator lhs_comparator { lhs };
CasefoldStringComparator rhs_comparator { rhs };
while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point())
return false;
}
return !lhs_comparator.has_more_data() && !rhs_comparator.has_more_data();
}
// Explicit instantiations of equals_ignoring_case for the UTF-8, UTF-16 and UTF-32 view types.
template bool equals_ignoring_case(Utf8View, Utf8View);
template bool equals_ignoring_case(Utf16View, Utf16View);
template bool equals_ignoring_case(Utf32View, Utf32View);
// Weak definitions that report "nothing found": an empty Optional, or false for the bool overload.
// NOTE(review): presumably replaced at link time by stronger definitions elsewhere - confirm.
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }

View File

@ -60,6 +60,9 @@ ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringV
ErrorOr<String> to_unicode_titlecase_full(StringView, Optional<StringView> const& locale = {}, TrailingCodePointTransformation trailing_code_point_transformation = TrailingCodePointTransformation::Lowercase);
ErrorOr<String> to_unicode_casefold_full(StringView);
// Returns whether the two views are a caseless match, i.e. toCasefold(lhs) == toCasefold(rhs).
// Only declared here; the definition is explicitly instantiated for Utf8View, Utf16View and Utf32View.
template<typename ViewType>
bool equals_ignoring_case(ViewType, ViewType);
Optional<GeneralCategory> general_category_from_string(StringView);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);

View File

@ -6,7 +6,6 @@
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/UnicodeUtils.h>
@ -43,60 +42,9 @@ ErrorOr<String> String::to_casefold() const
return builder.to_string();
}
// Walks a UTF-8 view and hands out its case-folded code points one at a time.
//
// Every code point read from the view is passed to Unicode::Detail::casefold_code_point(); the
// resulting sequence is buffered and fully drained before the next code point is read.
class CasefoldStringComparator {
public:
    explicit CasefoldStringComparator(Utf8View string)
        : m_string(string)
        , m_it(m_string.begin())
    {
    }

    // True while buffered case-folded code points remain, or the view has not been fully consumed.
    bool has_more_data() const
    {
        auto const buffer_has_data = !m_casefolded_code_points.is_empty();
        return buffer_has_data || (m_it != m_string.end());
    }

    // Returns the next case-folded code point. Callers must check has_more_data() first.
    u32 next_code_point()
    {
        VERIFY(has_more_data());

        auto const buffer_is_drained = m_casefolded_code_points.is_empty();

        if (buffer_is_drained) {
            // Consume one code point from the view and buffer its case folding.
            m_current_code_point = *m_it;
            ++m_it;

            m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
            VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
        }

        auto front = m_casefolded_code_points[0];
        m_casefolded_code_points = m_casefolded_code_points.substring_view(1);

        return front;
    }

private:
    Utf8View m_string;
    Utf8CodePointIterator m_it;

    // NOTE(review): kept as a member rather than a local; presumably the view returned by
    // casefold_code_point() may refer to the code point passed in - confirm before changing.
    u32 m_current_code_point { 0 };
    Utf32View m_casefolded_code_points;
};
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
// Returns whether this string and `other` are a caseless match.
bool String::equals_ignoring_case(String const& other) const
{
    // A string X is a caseless match for a string Y if and only if:
    //     toCasefold(X) = toCasefold(Y)
    //
    // The case-folded comparison is implemented once in LibUnicode for all Unicode view types,
    // so delegate to it with the UTF-8 code point views of both strings.
    return Unicode::equals_ignoring_case(code_points(), other.code_points());
}
}