From 08183f59f2e131f31d16f63ba0591a7ae0679f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 15 Mar 2022 17:11:46 +0100 Subject: [PATCH] Minor fixes for Text (#3340) * Avoid unnecessary copies * Add tests for conversions * Add guidelines for Text tests Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- .../0.0.0-dev/src/Data/Text/Extensions.enso | 6 +-- .../Base/0.0.0-dev/src/Data/Text/Span.enso | 3 ++ test/Tests/src/Data/Text/Span_Spec.enso | 5 +++ test/Tests/src/Data/Text/Utils_Spec.enso | 2 +- test/Tests/src/Data/Text_Spec.enso | 38 ++++++++++++++++++- 5 files changed, 49 insertions(+), 5 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index a7e53d5c61..1dfb3fea5f 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -1350,8 +1350,8 @@ Text.location_of_all : Text -> Matcher -> [Span] Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of Text_Matcher case_sensitive -> if term.is_empty then Vector.new (this.length + 1) (ix -> Span (Range ix ix) this) else case case_sensitive of True -> - codepoint_spans = Vector.from_array <| Text_Utils.span_of_all this term - grahpeme_ixes = Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array + codepoint_spans = Vector.Vector <| Text_Utils.span_of_all this term + grahpeme_ixes = Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices this (codepoint_spans.map .start).to_array ## While the codepoint_spans may have different code unit lengths from our term, the `length` counted in grapheme clusters is guaranteed to be the same. @@ -1360,7 +1360,7 @@ Text.location_of_all term="" matcher=Text_Matcher.new = case matcher of end = start+offset Span (Range start end) this Case_Insensitive locale -> - grapheme_spans = Vector.from_array <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale + grapheme_spans = Vector.Vector <| Text_Utils.span_of_all_case_insensitive this term locale.java_locale grapheme_spans.map grapheme_span-> Span (Range grapheme_span.start grapheme_span.end) this Regex_Matcher _ _ _ _ _ -> diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso index f357d13971..2c77dfd713 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso @@ -167,3 +167,6 @@ range_to_char_indices text range = start_index = iterator.next start end_index = iterator.next (end - start) Range start_index end_index + +Span.from (that:Utf_16_Span) = that.to_grapheme_span +Utf_16_Span.from (that:Span) = that.to_utf_16_span diff --git a/test/Tests/src/Data/Text/Span_Spec.enso b/test/Tests/src/Data/Text/Span_Spec.enso index 2de0ac5096..6aaa7c5193 100644 --- a/test/Tests/src/Data/Text/Span_Spec.enso +++ b/test/Tests/src/Data/Text/Span_Spec.enso @@ -34,4 +34,9 @@ spec = Test.group "Text.Span" <| Utf_16_Span (Range 0 3) text . to_grapheme_span . should_equal (Span (Range 0 2) text) Utf_16_Span (Range 0 2) text . to_grapheme_span . should_equal (Span (Range 0 1) text) + Test.specify "should be able to use the conversions" <| + text = 'ae\u{301}fz' + Utf_16_Span.from (Span (Range 1 3) text) . should_equal (Utf_16_Span (Range 1 4) text) + Span.from (Utf_16_Span (Range 2 4) text) . should_equal (Span (Range 1 3) text) + main = Test.Suite.run_main here.spec diff --git a/test/Tests/src/Data/Text/Utils_Spec.enso b/test/Tests/src/Data/Text/Utils_Spec.enso index b15d7ced51..11c4e41334 100644 --- a/test/Tests/src/Data/Text/Utils_Spec.enso +++ b/test/Tests/src/Data/Text/Utils_Spec.enso @@ -36,7 +36,7 @@ spec = Test.specify "should correctly translate a series of codepoint indices to a grapheme indices in a batch" <| translate_indices text ixes = - Vector.from_array <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array + Vector.Vector <| Text_Utils.utf16_indices_to_grapheme_indices text ixes.to_array codepoint_indices = Vector.new text.utf_16.length ix->ix translate_indices text codepoint_indices . should_equal codepoints_to_graphemes diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index 63ddb2514a..72d2831f9f 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -16,6 +16,42 @@ type Manual b Manual.to_text = "[[[MyREP " + this.b.to_text + "]]]" +## Specification of operations on the Text type. + + ? Guidelines on proper handling of edge cases in Text tests: + + The following edge cases should be considered: + - Handling of empty arguments. + - Using grapheme-cluster based indexing instead of code unit indexing where + appropriate: this can be tested by adding tests with graphemes that + consist of multiple code units, like 'e\u{301}' or emojis and ensuring + that the offsets are correct. + - Correct handling of Unicode normalization: some graphemes can be + expressed using different combinations of code units. All alternative + representations of the same grapheme should be treated as equivalent, i.e. + equality checks or substring search should work consistently. Interesting + examples are: + - 'e\u{301}' and '\u00E9' (both meaning 'é'), + - reordering of modifiers (although this may not work for all sets), for + example: 'e\u{321}\u{360}' should be equivalent to 'e\u{360}\u{321}'. + - in general 's' should not be treated as a substring of 's\u{301}' since + the latter is a two-codepoint encoding of a single grapheme 'ś' that is + different from 's'. + - Be aware that changing case can change the length of a string (in + extended grapheme clusters), a common example being `ß` becoming `SS` or + `ffi` becoming `FFI`. Case insensitive comparisons must take this into + consideration. Note that due to this, if matching strings case + insensitively, the length of the match can differ from the length of the + term being matched. + - Casing is locale-dependent. The pair of `i - I` is handled differently in + Turkish and Azerbaijani - instead there are two separate pairs: 'İ - i' + and 'I - ı'. + - Handling of out of range indices should be checked. In particular, often + the index `text.length` should still be valid to point just right at the + end of the text. Moreover, negative indices are usually allowed to index + from the back. + - Note that currently the regex-based operations may not handle the edge + cases described above too well. spec = Test.group "Text" <| kshi = '\u0915\u094D\u0937\u093F' @@ -91,7 +127,7 @@ spec = "I" . equals_ignore_case "i" . should_be_true "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true - "I" . equals_ignore_case "ı" (locale = Locale.new "tr") . should_be_true + "I" . equals_ignore_case "ı" (locale = Locale.new "az") . should_be_true "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false "Kongressstraße"=="Kongressstrasse" . should_be_false