mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-08 04:50:08 +03:00
LibUnicode: Define case-insensitive string comparison more generically
The only user is currently String::equals_ignoring_case, but LibRegex will need to do the same case-folded comparison with UTF-32 data. As it turns out, the comparison works with all Unicode view types without much fuss.
This commit is contained in:
parent
370ea9441c
commit
6070df40f3
Notes:
sideshowbarker
2024-07-18 05:01:22 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/6070df40f3 Pull-request: https://github.com/SerenityOS/serenity/pull/21846 Reviewed-by: https://github.com/alimpfard ✅
@ -8,6 +8,8 @@
|
||||
#include <AK/Platform.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/UnicodeUtils.h>
|
||||
@ -67,6 +69,68 @@ ErrorOr<String> to_unicode_casefold_full(StringView string)
|
||||
return builder.to_string();
|
||||
}
|
||||
|
||||
template<typename ViewType>
|
||||
class CasefoldStringComparator {
|
||||
public:
|
||||
explicit CasefoldStringComparator(ViewType string)
|
||||
: m_string(string)
|
||||
, m_it(m_string.begin())
|
||||
{
|
||||
}
|
||||
|
||||
bool has_more_data() const
|
||||
{
|
||||
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
|
||||
}
|
||||
|
||||
u32 next_code_point()
|
||||
{
|
||||
VERIFY(has_more_data());
|
||||
|
||||
if (m_casefolded_code_points.is_empty()) {
|
||||
m_current_code_point = *m_it;
|
||||
++m_it;
|
||||
|
||||
m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
|
||||
VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
|
||||
}
|
||||
|
||||
auto code_point = m_casefolded_code_points[0];
|
||||
m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
|
||||
|
||||
return code_point;
|
||||
}
|
||||
|
||||
private:
|
||||
ViewType m_string;
|
||||
typename ViewType::Iterator m_it;
|
||||
|
||||
u32 m_current_code_point { 0 };
|
||||
Utf32View m_casefolded_code_points;
|
||||
};
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
|
||||
template<typename ViewType>
|
||||
bool equals_ignoring_case(ViewType lhs, ViewType rhs)
|
||||
{
|
||||
// A string X is a caseless match for a string Y if and only if:
|
||||
// toCasefold(X) = toCasefold(Y)
|
||||
|
||||
CasefoldStringComparator lhs_comparator { lhs };
|
||||
CasefoldStringComparator rhs_comparator { rhs };
|
||||
|
||||
while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
|
||||
if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point())
|
||||
return false;
|
||||
}
|
||||
|
||||
return !lhs_comparator.has_more_data() && !rhs_comparator.has_more_data();
|
||||
}
|
||||
|
||||
// Explicitly instantiate the comparison for the UTF-8, UTF-16 and UTF-32 view types.
// NOTE(review): presumably required because callers only see the template's declaration - confirm.
template bool equals_ignoring_case(Utf8View, Utf8View);
|
||||
template bool equals_ignoring_case(Utf16View, Utf16View);
|
||||
template bool equals_ignoring_case(Utf32View, Utf32View);
|
||||
|
||||
// Weak fallback definitions: each ignores its arguments and returns a value-initialized
// result (an empty Optional, or false for the bool overload).
// NOTE(review): presumably replaced by strong definitions when generated Unicode data is linked in - confirm.
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
|
||||
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
|
||||
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
|
||||
|
@ -60,6 +60,9 @@ ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringV
|
||||
ErrorOr<String> to_unicode_titlecase_full(StringView, Optional<StringView> const& locale = {}, TrailingCodePointTransformation trailing_code_point_transformation = TrailingCodePointTransformation::Lowercase);
|
||||
ErrorOr<String> to_unicode_casefold_full(StringView);
|
||||
|
||||
template<typename ViewType>
|
||||
bool equals_ignoring_case(ViewType, ViewType);
|
||||
|
||||
Optional<GeneralCategory> general_category_from_string(StringView);
|
||||
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
|
||||
|
||||
|
@ -6,7 +6,6 @@
|
||||
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
#include <LibUnicode/UnicodeUtils.h>
|
||||
@ -43,60 +42,9 @@ ErrorOr<String> String::to_casefold() const
|
||||
return builder.to_string();
|
||||
}
|
||||
|
||||
class CasefoldStringComparator {
|
||||
public:
|
||||
explicit CasefoldStringComparator(Utf8View string)
|
||||
: m_string(string)
|
||||
, m_it(m_string.begin())
|
||||
{
|
||||
}
|
||||
|
||||
bool has_more_data() const
|
||||
{
|
||||
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
|
||||
}
|
||||
|
||||
u32 next_code_point()
|
||||
{
|
||||
VERIFY(has_more_data());
|
||||
|
||||
if (m_casefolded_code_points.is_empty()) {
|
||||
m_current_code_point = *m_it;
|
||||
++m_it;
|
||||
|
||||
m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
|
||||
VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
|
||||
}
|
||||
|
||||
auto code_point = m_casefolded_code_points[0];
|
||||
m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
|
||||
|
||||
return code_point;
|
||||
}
|
||||
|
||||
private:
|
||||
Utf8View m_string;
|
||||
Utf8CodePointIterator m_it;
|
||||
|
||||
u32 m_current_code_point { 0 };
|
||||
Utf32View m_casefolded_code_points;
|
||||
};
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
|
||||
// Returns true if this string and `other` are caseless matches, i.e. their case-folded
// code point sequences are identical.
bool String::equals_ignoring_case(String const& other) const
{
    // A string X is a caseless match for a string Y if and only if:
    //     toCasefold(X) = toCasefold(Y)
    //
    // The folded comparison is implemented generically in LibUnicode for all Unicode view
    // types, so delegate to it instead of keeping a second copy of the loop here.
    return Unicode::equals_ignoring_case(code_points(), other.code_points());
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user