From 08183f59f2e131f31d16f63ba0591a7ae0679f28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= <radoslaw.wasko@enso.org>
Date: Tue, 15 Mar 2022 17:11:46 +0100
Subject: [PATCH] Minor fixes for Text (#3340)

* Avoid unnecessary copies

* Add tests for conversions

* Add guidelines for Text tests

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
---
 .../0.0.0-dev/src/Data/Text/Extensions.enso   |  6 +--
 .../Base/0.0.0-dev/src/Data/Text/Span.enso    |  3 ++
 test/Tests/src/Data/Text/Span_Spec.enso       |  5 +++
 test/Tests/src/Data/Text/Utils_Spec.enso      |  2 +-
 test/Tests/src/Data/Text_Spec.enso            | 38 ++++++++++++++++++-
 5 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
index a7e53d5c61..1dfb3fea5f 100644
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
@@ -1350,8 +1350,8 @@ Text.location_of_all : Text -> Matcher -> [Span]
 Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
     Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of
         True ->
-            codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term
-            grahpeme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
+            codepoint_spans = Vector.Vector <| Text_Utils.span_of_all this term
+            grahpeme_ixes = Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array
             ## While the codepoint_spans may have different code unit lengths
                from our term, the `length` counted in grapheme clusters is
                guaranteed to be the same.
@@ -1360,7 +1360,7 @@ Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of
                 end = start+offset
                 Span (Range start end) this
         Case_Insensitive locale ->
-            grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
+            grapheme_spans = Vector.Vector <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale
             grapheme_spans.map grapheme_span->
                 Span (Range grapheme_span.start grapheme_span.end) this
     Regex_Matcher _ _ _ _ _ ->
diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso
index f357d13971..2c77dfd713 100644
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso
@@ -167,3 +167,6 @@ range_to_char_indices text range =
                 start_index = iterator.next start
                 end_index = iterator.next (end - start)
                 Range start_index end_index
+
+Span.from (that:Utf_16_Span) = that.to_grapheme_span
+Utf_16_Span.from (that:Span) = that.to_utf_16_span
diff --git a/test/Tests/src/Data/Text/Span_Spec.enso b/test/Tests/src/Data/Text/Span_Spec.enso
index 2de0ac5096..6aaa7c5193 100644
--- a/test/Tests/src/Data/Text/Span_Spec.enso
+++ b/test/Tests/src/Data/Text/Span_Spec.enso
@@ -34,4 +34,9 @@ spec = Test.group "Text.Span" <|
         Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text)
         Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text)
 
+    Test.specify "should be able to use the conversions" <|
+        text = 'ae\u{301}fz'
+        Utf_16_Span.from (Span (Range 1 3) text) . should_equal (Utf_16_Span (Range 1 4) text)
+        Span.from (Utf_16_Span (Range 2 4) text) . should_equal (Span (Range 1 3) text)
+
 main = Test.Suite.run_main here.spec
diff --git a/test/Tests/src/Data/Text/Utils_Spec.enso b/test/Tests/src/Data/Text/Utils_Spec.enso
index b15d7ced51..11c4e41334 100644
--- a/test/Tests/src/Data/Text/Utils_Spec.enso
+++ b/test/Tests/src/Data/Text/Utils_Spec.enso
@@ -36,7 +36,7 @@ spec =
 
         Test.specify "should correctly translate a series of codepoint indices to a grapheme indices in a batch" <|
             translate_indices text ixes =
-                Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
+                Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array
             codepoint_indices = Vector.new text.utf_16.length ix->ix
             translate_indices text codepoint_indices . should_equal codepoints_to_graphemes
 
diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso
index 63ddb2514a..72d2831f9f 100644
--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@@ -16,6 +16,42 @@ type Manual b
 
 Manual.to_text = "[[[MyREP " + this.b.to_text + "]]]"
 
+## Specification of operations on the Text type.
+
+   ? Guidelines on proper handling of edge cases in Text tests:
+
+     The following edge cases should be considered:
+     - Handling of empty arguments.
+     - Using grapheme-cluster based indexing instead of code unit indexing where
+       appropriate: this can be tested by adding tests with graphemes that
+       consist of multiple code units, like 'e\u{301}' or emojis and ensuring
+       that the offsets are correct.
+     - Correct handling of Unicode normalization: some graphemes can be
+       expressed using different combinations of code units. All alternative
+       representations of the same grapheme should be treated as equivalent, i.e.
+       equality checks or substring search should work consistently. Interesting
+       examples are:
+       - 'e\u{301}' and '\u00E9' (both meaning 'é'),
+       - reordering of modifiers (although this may not work for all sets), for
+         example: 'e\u{321}\u{360}' should be equivalent to 'e\u{360}\u{321}'.
+       - in general 's' should not be treated as a substring of 's\u{301}' since
+         the latter is a two-codepoint encoding of a single grapheme 'ś' that is
+         different from 's'.
+     - Be aware that changing case can change the length of a string (in
+       extended grapheme clusters), a common example being `ß` becoming `SS` or
+       `ﬃ` becoming `FFI`. Case insensitive comparisons must take this into
+       consideration. Note that due to this, if matching strings case
+       insensitively, the length of the match can differ from the length of the
+       term being matched.
+     - Casing is locale-dependent. The pair of `i - I` is handled differently in
+       Turkish and Azerbaijani - instead there are two separate pairs: 'İ - i'
+       and 'I - ı'.
+     - Handling of out of range indices should be checked. In particular, often
+       the index `text.length` should still be valid to point just right at the
+       end of the text. Moreover, negative indices are usually allowed to index
+       from the back.
+     - Note that currently the regex-based operations may not handle the edge
+       cases described above too well.
 spec =
     Test.group "Text" <|
         kshi = '\u0915\u094D\u0937\u093F'
@@ -91,7 +127,7 @@ spec =
 
             "I" . equals_ignore_case "i" . should_be_true
             "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
-            "I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true
+            "I" . equals_ignore_case "ı" (locale = Locale.new "az") . should_be_true
             "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
 
             "Kongressstraße"=="Kongressstrasse" . should_be_false