mirror of
https://github.com/enso-org/enso.git
synced 2024-11-27 06:22:09 +03:00
Minor fixes for Text (#3340)
* Avoid unnecessary copies * Add tests for conversions * Add guidelines for Text tests Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
This commit is contained in:
parent
5bc9811f6a
commit
08183f59f2
@ -1350,8 +1350,8 @@ Text.location_of_all : Text -> Matcher -> [Span]
|
||||
Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
|
||||
Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of
|
||||
True ->
|
||||
codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term
|
||||
grahpeme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
|
||||
codepoint_spans = Vector.Vector <| Text_Utils.span_of_all this term
|
||||
grahpeme_ixes = Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
|
||||
## While the codepoint_spans may have different code unit lengths
|
||||
from our term, the `length` counted in grapheme clusters is
|
||||
guaranteed to be the same.
|
||||
@ -1360,7 +1360,7 @@ Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
|
||||
end = start+offset
|
||||
Span (Range start end) this
|
||||
Case_Insensitive locale ->
|
||||
grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
|
||||
grapheme_spans = Vector.Vector <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
|
||||
grapheme_spans.map grapheme_span->
|
||||
Span (Range grapheme_span.start grapheme_span.end) this
|
||||
Regex_Matcher _ _ _ _ _ ->
|
||||
|
@ -167,3 +167,6 @@ range_to_char_indices text range =
|
||||
start_index = iterator.next start
|
||||
end_index = iterator.next (end - start)
|
||||
Range start_index end_index
|
||||
|
||||
Span.from (that:Utf_16_Span) = that.to_grapheme_span
|
||||
Utf_16_Span.from (that:Span) = that.to_utf_16_span
|
||||
|
@ -34,4 +34,9 @@ spec = Test.group "Text.Span" <|
|
||||
Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
|
||||
Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
|
||||
|
||||
Test.specify "should be able to use the conversions" <|
|
||||
text = 'ae\u{301}fz'
|
||||
Utf_16_Span.from (Span (Range 1 3) text) . should_equal (Utf_16_Span (Range 1 4) text)
|
||||
Span.from (Utf_16_Span (Range 2 4) text) . should_equal (Span (Range 1 3) text)
|
||||
|
||||
main = Test.Suite.run_main here.spec
|
||||
|
@ -36,7 +36,7 @@ spec =
|
||||
|
||||
Test.specify "should correctly translate a series of codepoint indices to a grapheme indices in a batch" <|
|
||||
translate_indices text ixes =
|
||||
Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
|
||||
Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
|
||||
codepoint_indices = Vector.new text.utf_16.length ix->ix
|
||||
translate_indices text codepoint_indices . should_equal codepoints_to_graphemes
|
||||
|
||||
|
@ -16,6 +16,42 @@ type Manual b
|
||||
|
||||
Manual.to_text = "[[[MyREP " + this.b.to_text + "]]]"
|
||||
|
||||
## Specification of operations on the Text type.
|
||||
|
||||
? Guidelines on proper handling of edge cases in Text tests:
|
||||
|
||||
The following edge cases should be considered:
|
||||
- Handling of empty arguments.
|
||||
- Using grapheme-cluster based indexing instead of code unit indexing where
|
||||
appropriate: this can be tested by adding tests with graphemes that
|
||||
consist of multiple code units, like 'e\u{301}' or emojis and ensuring
|
||||
that the offsets are correct.
|
||||
- Correct handling of Unicode normalization: some graphemes can be
|
||||
expressed using different combinations of code units. All alternative
|
||||
representations of the same grapheme should be treated as equivalent, i.e.
|
||||
equality checks or substring search should work consistently. Interesting
|
||||
examples are:
|
||||
- 'e\u{301}' and '\u00E9' (both meaning 'é'),
|
||||
- reordering of modifiers (although this may not work for all sets), for
|
||||
example: 'e\u{321}\u{360}' should be equivalent to 'e\u{360}\u{321}'.
|
||||
- in general 's' should not be treated as a substring of 's\u{301}' since
|
||||
the latter is a two-codepoint encoding of a single grapheme 'ś' that is
|
||||
different from 's'.
|
||||
- Be aware that changing case can change the length of a string (in
|
||||
extended grapheme clusters), a common example being `ß` becoming `SS` or
|
||||
`ffi` becoming `FFI`. Case insensitive comparisons must take this into
|
||||
consideration. Note that due to this, if matching strings case
|
||||
insensitively, the length of the match can differ from the length of the
|
||||
term being matched.
|
||||
- Casing is locale-dependent. The pair of `i - I` is handled differently in
|
||||
Turkish and Azerbaijani - instead there are two separate pairs: 'İ - i'
|
||||
and 'I - ı'.
|
||||
- Handling of out of range indices should be checked. In particular, often
|
||||
the index `text.length` should still be valid to point just right at the
|
||||
end of the text. Moreover, negative indices are usually allowed to index
|
||||
from the back.
|
||||
- Note that currently the regex-based operations may not handle the edge
|
||||
cases described above too well.
|
||||
spec =
|
||||
Test.group "Text" <|
|
||||
kshi = '\u0915\u094D\u0937\u093F'
|
||||
@ -91,7 +127,7 @@ spec =
|
||||
|
||||
"I" . equals_ignore_case "i" . should_be_true
|
||||
"İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
|
||||
"I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
|
||||
"I" . equals_ignore_case "ı" (locale = Locale.new "az") . should_be_true
|
||||
"I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
|
||||
|
||||
"Kongressstraße"=="Kongressstrasse" . should_be_false
|
||||
|
Loading…
Reference in New Issue
Block a user