Minor fixes for Text (#3340)

* Avoid unnecessary copies
* Add tests for conversions
* Add guidelines for Text tests

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2024-11-27 03:43:20 +03:00 · 2022-03-15 17:11:46 +01:00 · 2022-03-15 17:11:46 +01:00 · 08183f59f2
commit 08183f59f2
parent 5bc9811f6a
5 changed files with 49 additions and 5 deletions
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
@ -1350,8 +1350,8 @@ Text.location_of_all : Text -> Matcher -> [Span]
 Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
    Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of
        True ->
-            codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term
-            grahpeme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
+            codepoint_spans = Vector.Vector <| Text_Utils.span_of_all this term
+            grahpeme_ixes = Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
            ## While the codepoint_spans may have different code unit lengths
               from our term, the `length` counted in grapheme clusters is
               guaranteed to be the same.
@ -1360,7 +1360,7 @@ Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
                end = start+offset
                Span (Range start end) this
        Case_Insensitive locale ->
-            grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
+            grapheme_spans = Vector.Vector <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
            grapheme_spans.map grapheme_span->
                Span (Range grapheme_span.start grapheme_span.end) this
    Regex_Matcher _ _ _ _ _ ->
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso
@ -167,3 +167,6 @@ range_to_char_indices text range =
                start_index = iterator.next start
                end_index = iterator.next (end - start)
                Range start_index end_index
+
+## Conversion from a `Utf_16_Span` (indexed in UTF-16 code units) to a `Span`
+   (indexed in grapheme clusters); delegates to `to_grapheme_span`.
+   NOTE(review): per the Span_Spec test, a UTF-16 range that starts inside a
+   grapheme (e.g. `Range 2 4` of 'ae\u{301}fz') maps to the whole grapheme
+   (`Range 1 3`) - confirm this widening is the intended contract.
+Span.from (that:Utf_16_Span) = that.to_grapheme_span
+## Conversion from a grapheme-indexed `Span` to a `Utf_16_Span` indexed in
+   UTF-16 code units; delegates to `to_utf_16_span`.
+Utf_16_Span.from (that:Span) = that.to_utf_16_span
--- a/test/Tests/src/Data/Text/Span_Spec.enso
+++ b/test/Tests/src/Data/Text/Span_Spec.enso
@ -34,4 +34,9 @@ spec = Test.group "Text.Span" <|
        Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
        Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)

+    Test.specify "should be able to use the conversions" <|
+        text = 'ae\u{301}fz'
+        Utf_16_Span.from (Span (Range 1 3) text) . should_equal (Utf_16_Span (Range 1 4) text)
+        Span.from (Utf_16_Span (Range 2 4) text) . should_equal (Span (Range 1 3) text)
+
 main = Test.Suite.run_main here.spec
--- a/test/Tests/src/Data/Text/Utils_Spec.enso
+++ b/test/Tests/src/Data/Text/Utils_Spec.enso
@ -36,7 +36,7 @@ spec =

        Test.specify "should correctly translate a series of codepoint indices to a grapheme indices in a batch" <|
            translate_indices text ixes =
-                Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
+                Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
            codepoint_indices = Vector.new text.utf_16.length ix->ix
            translate_indices text codepoint_indices . should_equal codepoints_to_graphemes

--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@ -16,6 +16,42 @@ type Manual b

 Manual.to_text = "[[[MyREP " + this.b.to_text + "]]]"

+## Specification of operations on the Text type.
+
+   ? Guidelines on proper handling of edge cases in Text tests:
+
+     The following edge cases should be considered:
+     - Handling of empty arguments.
+     - Using grapheme-cluster based indexing instead of code unit indexing where
+       appropriate: this can be tested by adding tests with graphemes that
+       consist of multiple code units, like 'e\u{301}' or emojis and ensuring
+       that the offsets are correct.
+     - Correct handling of Unicode normalization: some graphemes can be
+       expressed using different combinations of code units. All alternative
+       representations of the same grapheme should be treated as equivalent, i.e.
+       equality checks or substring search should work consistently. Interesting
+       examples are:
+       - 'e\u{301}' and '\u00E9' (both meaning 'é'),
+       - reordering of modifiers (although this may not work for all sets), for
+         example: 'e\u{321}\u{360}' should be equivalent to 'e\u{360}\u{321}'.
+       - in general 's' should not be treated as a substring of 's\u{301}' since
+         the latter is a two-codepoint encoding of a single grapheme 'ś' that is
+         different from 's'.
+     - Be aware that changing case can change the length of a string (in
+       extended grapheme clusters), a common example being `ß` becoming `SS` or
+       `ﬃ` becoming `FFI`. Case insensitive comparisons must take this into
+       consideration. Note that due to this, if matching strings case
+       insensitively, the length of the match can differ from the length of the
+       term being matched.
+     - Casing is locale-dependent. In Turkish and Azerbaijani the letters `i`
+       and `I` do not form a case pair - instead there are two separate pairs:
+       'İ - i' and 'I - ı'.
+     - Handling of out-of-range indices should be checked. In particular, the
+       index `text.length` should often still be valid, pointing right at the
+       end of the text. Moreover, negative indices are usually allowed to index
+       from the back.
+     - Note that currently the regex-based operations may not handle the edge
+       cases described above too well.
 spec =
    Test.group "Text" <|
        kshi = '\u0915\u094D\u0937\u093F'
@ -91,7 +127,7 @@ spec =

            "I" . equals_ignore_case "i" . should_be_true
            "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
-            "I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
+            "I" . equals_ignore_case "ı" (locale = Locale.new "az") . should_be_true
            "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false

            "Kongressstraße"=="Kongressstrasse" . should_be_false