Minor fixes for Text (#3340)

* Avoid unnecessary copies

* Add tests for conversions

* Add guidelines for Text tests

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
This commit is contained in:
Radosław Waśko 2022-03-15 17:11:46 +01:00 committed by GitHub
parent 5bc9811f6a
commit 08183f59f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 49 additions and 5 deletions

View File

@ -1350,8 +1350,8 @@ Text.location_of_all : Text -> Matcher -> [Span]
Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of
True ->
codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term
grahpeme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
codepoint_spans = Vector.Vector <| Text_Utils.span_of_all this term
grahpeme_ixes = Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
## While the codepoint_spans may have different code unit lengths
from our term, the `length` counted in grapheme clusters is
guaranteed to be the same.
@ -1360,7 +1360,7 @@ Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
end = start+offset
Span (Range start end) this
Case_Insensitive locale ->
grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
grapheme_spans = Vector.Vector <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
grapheme_spans.map grapheme_span->
Span (Range grapheme_span.start grapheme_span.end) this
Regex_Matcher _ _ _ _ _ ->

View File

@ -167,3 +167,6 @@ range_to_char_indices text range =
start_index = iterator.next start
end_index = iterator.next (end - start)
Range start_index end_index
Span.from (that:Utf_16_Span) = that.to_grapheme_span
Utf_16_Span.from (that:Span) = that.to_utf_16_span

View File

@ -34,4 +34,9 @@ spec = Test.group "Text.Span" <|
Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
Test.specify "should be able to use the conversions" <|
text = 'ae\u{301}fz'
Utf_16_Span.from (Span (Range 1 3) text) . should_equal (Utf_16_Span (Range 1 4) text)
Span.from (Utf_16_Span (Range 2 4) text) . should_equal (Span (Range 1 3) text)
main = Test.Suite.run_main here.spec

View File

@ -36,7 +36,7 @@ spec =
Test.specify "should correctly translate a series of codepoint indices to a grapheme indices in a batch" <|
translate_indices text ixes =
Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
codepoint_indices = Vector.new text.utf_16.length ix->ix
translate_indices text codepoint_indices . should_equal codepoints_to_graphemes

View File

@ -16,6 +16,42 @@ type Manual b
Manual.to_text = "[[[MyREP " + this.b.to_text + "]]]"
## Specification of operations on the Text type.
? Guidelines on proper handling of edge cases in Text tests:
The following edge cases should be considered:
- Handling of empty arguments.
- Using grapheme-cluster based indexing instead of code unit indexing where
appropriate: this can be tested by adding tests with graphemes that
consist of multiple code units, like 'e\u{301}' or emojis and ensuring
that the offsets are correct.
- Correct handling of Unicode normalization: some graphemes can be
expressed using different combinations of code units. All alternative
representations of the same grapheme should be treated as equivalent, i.e.
equality checks or substring search should work consistently. Interesting
examples are:
- 'e\u{301}' and '\u00E9' (both meaning 'é'),
- reordering of modifiers (although this may not work for all sets), for
example: 'e\u{321}\u{360}' should be equivalent to 'e\u{360}\u{321}'.
- in general 's' should not be treated as a substring of 's\u{301}' since
the latter is a two-codepoint encoding of a single grapheme 'ś' that is
different from 's'.
- Be aware that changing case can change the length of a string (in
extended grapheme clusters), a common example being `ß` becoming `SS` or
`ffi` becoming `FFI`. Case insensitive comparisons must take this into
consideration. Note that due to this, if matching strings case
insensitively, the length of the match can differ from the length of the
term being matched.
- Casing is locale-dependent. The pair of `i - I` is handled differently in
Turkish and Azerbaijani - instead there are two separate pairs: 'İ - i'
and 'I - ı'.
- Handling of out of range indices should be checked. In particular, often
the index `text.length` should still be valid to point just right at the
end of the text. Moreover, negative indices are usually allowed to index
from the back.
- Note that currently the regex-based operations may not handle the edge
cases described above too well.
spec =
Test.group "Text" <|
kshi = '\u0915\u094D\u0937\u093F'
@ -91,7 +127,7 @@ spec =
"I" . equals_ignore_case "i" . should_be_true
"İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
"I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
"I" . equals_ignore_case "ı" (locale = Locale.new "az") . should_be_true
"I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
"Kongressstraße"=="Kongressstrasse" . should_be_false