Data analysts should be able to use Text.starts_with and Text.ends_with (#3292)

Implements https://www.pivotaltracker.com/story/show/181265900
2024-12-31 07:02:26 +03:00 · 2022-02-23 17:48:33 +01:00 · 2022-02-23 17:48:33 +01:00 · 2ae636f63c
commit 2ae636f63c
parent a13c6e84b5
4 changed files with 256 additions and 39 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -50,6 +50,8 @@
  search.][3285]
 - [Implemented new `Text.take` and `Text.drop` functions, replacing existing
  functions][3287]
+- [Implemented new `Text.starts_with` and `Text.ends_with` functions, replacing
+  existing functions][3292]

 [debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -78,6 +80,7 @@
 [3282]: https://github.com/enso-org/enso/pull/3282
 [3285]: https://github.com/enso-org/enso/pull/3285
 [3287]: https://github.com/enso-org/enso/pull/3287
+[3292]: https://github.com/enso-org/enso/pull/3292

 #### Enso Compiler

--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
@ -711,6 +711,10 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array

   Arguments:
   - prefix: The prefix to see if `this` starts with.
+   - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
+     rules specified in the matcher.
+     If a `Regex_Matcher`, the prefix is used as a regular expression and
+     matched using the associated options.

   ! Unicode Equality
     The definition of equality includes Unicode canonicalization. I.e. two
@ -718,12 +722,39 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
     ensures that different ways of expressing the same character in the
     underlying binary representation are considered equal.

-   > Example
-     See if the text "Hello" starts with the prefix "hi".
+     This however is not always well handled by the regex engine. The behaviour
+     is as follows:

-         "Hello".starts_with "hi"
-Text.starts_with : Text -> Boolean
-Text.starts_with prefix = Text_Utils.starts_with this prefix
+         'ś' . starts_with 's' == False
+         's\u{301}' . starts_with 's' == False
+         's\u{301}' . starts_with 'ś' == True
+         'ś' . starts_with 's\u{301}' == True
+
+         'ś' . starts_with 's' (Regex_Matcher.new) == True
+         's\u{301}' . starts_with 's' (Regex_Matcher.new) == True
+         's\u{301}' . starts_with 'ś' (Regex_Matcher.new) == True
+         'ś' . starts_with 's\u{301}' (Regex_Matcher.new) == True
+
+   > Example
+     See if the text "Hello!" starts with the specified prefix.
+
+         "Hello!".starts_with "Hello" == True
+         "Hello!".starts_with "hello" == False
+         "Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) == True
+         "Hello!".starts_with "[a-z]" Regex_Matcher.new == False
+         "Hello!".starts_with "[A-Z]" Regex_Matcher.new == True
+Text.starts_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
+Text.starts_with prefix matcher=Text_Matcher.new = case matcher of
+    Text_Matcher case_sensitivity -> case case_sensitivity of
+        True ->
+            this.take (Text_Sub_Range.First prefix.length) == prefix
+        Case_Insensitive locale ->
+            this.take (Text_Sub_Range.First prefix.length) . equals_ignore_case prefix locale=locale
+    Regex_Matcher _ _ _ _ comments ->
+        # In comments mode a trailing `#` comment inside `prefix` would swallow
+        # the closing parenthesis. A newline terminates such a comment and is
+        # itself ignored as whitespace in that mode. Outside of comments mode a
+        # newline would be matched literally, so it is only added when needed.
+        group_end = if comments then '\n)' else ")"
+        # `\A` anchors the match at the very start of the text; the
+        # non-capturing group keeps alternations in `prefix` under the anchor.
+        preprocessed_pattern = "\A(?:" + prefix + group_end
+        compiled_pattern = here.prepare_regex preprocessed_pattern matcher
+        match = compiled_pattern.match this Mode.First
+        match.is_nothing.not

 ## ALIAS Check Suffix

@ -731,6 +762,10 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix

   Arguments:
   - suffix: The suffix to see if `this` ends with.
+   - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
+     rules specified in the matcher.
+     If a `Regex_Matcher`, the suffix is used as a regular expression and
+     matched using the associated options.

   ! Unicode Equality
     The definition of equality includes Unicode canonicalization. I.e. two
@ -739,10 +774,24 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix
     underlying binary representation are considered equal.

   > Example
-     See if the text "Hello" ends with the suffix "low".
-         "Hello".ends_with "low"
-Text.ends_with : Text -> Boolean
-Text.ends_with suffix = Text_Utils.ends_with this suffix
+     See if the text "Hello World" ends with the specified suffix.
+
+         "Hello World".ends_with "World" == True
+         "Hello World".ends_with "world" == False
+         "Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) == True
+         "Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new == True
+Text.ends_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
+Text.ends_with suffix matcher=Text_Matcher.new = case matcher of
+    Text_Matcher case_sensitivity -> case case_sensitivity of
+        True ->
+            this.take (Text_Sub_Range.Last suffix.length) == suffix
+        Case_Insensitive locale ->
+            this.take (Text_Sub_Range.Last suffix.length) . equals_ignore_case suffix locale=locale
+    Regex_Matcher _ _ _ _ comments ->
+        # In comments mode a trailing `#` comment inside `suffix` would swallow
+        # both the closing parenthesis and the end anchor. A newline terminates
+        # such a comment and is itself ignored as whitespace in that mode.
+        # Outside of comments mode a newline would be matched literally, so it
+        # is only added when needed.
+        group_end = if comments then '\n)' else ")"
+        # `\z` anchors the match at the very end of the text; the non-capturing
+        # group keeps alternations in `suffix` under the anchor.
+        preprocessed_pattern = "(?:" + suffix + group_end + "\z"
+        compiled_pattern = here.prepare_regex preprocessed_pattern matcher
+        match = compiled_pattern.match this Mode.First
+        match.is_nothing.not

 ## ALIAS Contains

@ -801,14 +850,8 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
        True -> Text_Utils.contains this term
        Case_Insensitive locale ->
            Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale)
-    Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
-        case_insensitive = case case_sensitive of
-            True -> False
-            ## TODO [RW] Currently locale is not supported in case-insensitive
-               Regex matching. There are plans to revisit it:
-               https://www.pivotaltracker.com/story/show/181313576
-            Case_Insensitive _ -> True
-        compiled_pattern = Regex.compile term case_insensitive=case_insensitive match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
+    Regex_Matcher _ _ _ _ _ ->
+        compiled_pattern = here.prepare_regex term matcher
        match = compiled_pattern.match this Mode.First
        match.is_nothing.not

@ -997,3 +1040,16 @@ Text.to_lower_case locale=Locale.default =
 Text.to_upper_case : Locale.Locale -> Text
 Text.to_upper_case locale=Locale.default =
    UCharacter.toUpperCase locale.java_locale this
+
+## PRIVATE
+   Compiles `pattern` into a regex using the options held by `regex_matcher`.
+prepare_regex : Text -> Regex_Matcher -> Pattern
+prepare_regex pattern regex_matcher = case regex_matcher of
+    Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
+        ignore_case = case case_sensitive of
+            True -> False
+            ## TODO [RW] Currently locale is not supported in case-insensitive
+               Regex matching. There are plans to revisit it:
+               https://www.pivotaltracker.com/story/show/181313576
+            Case_Insensitive _ -> True
+        Regex.compile pattern case_insensitive=ignore_case match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
--- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
+++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
@ -157,28 +157,6 @@ public class Text_Utils {
    return String.valueOf(chars);
  }

-  /**
-   * Checks whether {@code prefix} is a prefix of {@code str}.
-   *
-   * @param str the string to check
-   * @param prefix the potential prefix
-   * @return whether {@code prefix} is a prefix of {@code str}
-   */
-  public static boolean starts_with(String str, String prefix) {
-    return str.startsWith(prefix);
-  }
-
-  /**
-   * Checks whether {@code suffix} is a suffix of {@code str}.
-   *
-   * @param str the string to check
-   * @param suffix the potential suffix
-   * @return whether {@code suffix} is a suffix of {@code str}
-   */
-  public static boolean ends_with(String str, String suffix) {
-    return str.endsWith(suffix);
-  }
-
  /**
   * Compares {@code a} to {@code b} according to the lexicographical order, handling Unicode
   * normalization.
--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@ -90,6 +90,7 @@ spec =
            "I" . equals_ignore_case "ı" . should_be_true
            "İ" . equals_ignore_case "i" . should_be_false
            "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
+            "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false

            "Kongressstraße"=="Kongressstrasse" . should_be_false
            "Kongressstraße" . equals_ignore_case "Kongressstrasse" . should_be_true
@ -425,6 +426,185 @@ spec =
            long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
            long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false

+        Test.specify "should check for starts_with using Unicode normalization" <|
+            "Hello".starts_with "He" . should_be_true
+
+            "Ściana".starts_with 'S\u{301}' . should_be_true
+            "Ściana".starts_with 'Ś' . should_be_true
+            "Ściana".starts_with 'S' . should_be_false
+            'S\u{301}ciana'.starts_with 'Ś' . should_be_true
+            'S\u{301}ciana'.starts_with 'S\u{301}' . should_be_true
+            'S\u{301}ciana'.starts_with 'S' . should_be_false
+
+            "ABC" . starts_with "A" . should_be_true
+            "ABC" . starts_with "a" . should_be_false
+            "" . starts_with "foo" . should_be_false
+            "abc" . starts_with "" . should_be_true
+            "" . starts_with "" . should_be_true
+            "foo foo foo" . starts_with "foo" . should_be_true
+
+            "Hello!".starts_with "he" . should_be_false
+
+        Test.specify "starts_with should work as shown in the examples" <|
+            "Hello!".starts_with "Hello" . should_be_true
+            "Hello!".starts_with "hello" . should_be_false
+            "Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "Hello!".starts_with "[a-z]" Regex_Matcher.new . should_be_false
+            "Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
+
+        Test.specify "should allow for case-insensitive starts_with checks" <|
+            "Hello".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+            "Ściana".starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
+            "Ściana".starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
+            'S\u{301}ciana'.starts_with 'ś' (Text_Matcher Case_Insensitive.new) . should_be_true
+            'S\u{301}ciana'.starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
+            'S\u{301}ciana'.starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
+
+            "ABC" . starts_with "A" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "ABC" . starts_with "a" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "ABC" . starts_with "C" (Text_Matcher Case_Insensitive.new) . should_be_false
+            "" . starts_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
+            "abc" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "fOo FOO foo" . starts_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+            "Hello!".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+        Test.specify "should allow for Regex starts_with checks" <|
+            "Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
+            "foobar" . starts_with ".o." Regex_Matcher.new . should_be_true
+            "foob" . starts_with ".f." Regex_Matcher.new . should_be_false
+
+            "123 meters and 4 centimeters" . starts_with "[0-9]+" Regex_Matcher.new . should_be_true
+            "foo 123" . starts_with "[0-9]+" Regex_Matcher.new . should_be_false
+
+            # Correct non-regex behaviour for reference.
+            'ś' . starts_with 's' . should_be_false
+            's\u{301}' . starts_with 's' . should_be_false
+            's\u{301}' . starts_with 'ś' . should_be_true
+            'ś' . starts_with 's\u{301}' . should_be_true
+
+            # These two behave as expected.
+            's\u{301}' . starts_with 'ś' (Regex_Matcher.new) . should_be_true
+            'ś' . starts_with 's\u{301}' (Regex_Matcher.new) . should_be_true
+
+            ## These two are included to document the current behaviour
+               (even though ideally, we would want them to return False).
+            'ś' . starts_with 's' (Regex_Matcher.new) . should_be_true
+            's\u{301}' . starts_with 's' (Regex_Matcher.new) . should_be_true
+
+            "ściana" . starts_with "ś" Regex_Matcher.new . should_be_true
+            "ściana" . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
+            's\u{301}ciana' . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
+            's\u{301}ciana' . starts_with 'ś' Regex_Matcher.new . should_be_true
+
+            ## These two tests below are disabled due to how regex is handling
+               letters with accents. See the tests above for explanation.
+            #"ściana" . starts_with "s" Regex_Matcher.new . should_be_false
+            # 's\u{301}ciana' . starts_with 's' Regex_Matcher.new . should_be_false
+
+            "fOOBar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
+            "faaaar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
+
+            long_text = """
+                EOL
+                SOL Hmm...
+            long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
+            long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
+
+            "aaazzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
+            "bbbzzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
+            "zzzaaa" . starts_with "a|b" Regex_Matcher.new . should_be_false
+            "zzzbbb" . starts_with "a|b" Regex_Matcher.new . should_be_false
+            "aaazzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
+            "bbbzzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
+            "zzzaaa" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_false
+            "ABC" . starts_with "\AA" Regex_Matcher.new . should_be_true
+            "ABC" . starts_with "\AA\z" Regex_Matcher.new . should_be_false
+            "foobar" . starts_with "" Regex_Matcher.new . should_be_true
+            "" . starts_with "" Regex_Matcher.new . should_be_true
+
+        Test.specify "should check for ends_with using Unicode normalization" <|
+            "Hello".ends_with "lo" . should_be_true
+            "Hello".ends_with "LO" . should_be_false
+
+            "rzeczywistość".ends_with 'c\u{301}' . should_be_true
+            "rzeczywistość".ends_with 'ć' . should_be_true
+            "rzeczywistość".ends_with 'c' . should_be_false
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'ć' . should_be_true
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'c\u{301}' . should_be_true
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'c' . should_be_false
+
+            "ABC" . ends_with "C" . should_be_true
+            "ABC" . ends_with "c" . should_be_false
+            "" . ends_with "foo" . should_be_false
+            "abc" . ends_with "" . should_be_true
+            "" . ends_with "" . should_be_true
+            "foo foo foo" . ends_with "foo" . should_be_true
+
+        Test.specify "ends_with should work as shown in the examples" <|
+            "Hello World".ends_with "World" . should_be_true
+            "Hello World".ends_with "world" . should_be_false
+            "Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new . should_be_true
+
+        Test.specify "should allow for case-insensitive ends_with checks" <|
+            "Hello".ends_with "LO" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+            "rzeczywistość".ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
+            "rzeczywistość".ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'Ć' (Text_Matcher Case_Insensitive.new) . should_be_true
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
+            'rzeczywistos\u{301}c\u{301}'.ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
+
+            "ABC" . ends_with "C" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "ABC" . ends_with "c" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "ABC" . ends_with "A" (Text_Matcher Case_Insensitive.new) . should_be_false
+            "" . ends_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
+            "abc" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
+            "fOo FOO fOo" . ends_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
+
+        Test.specify "should allow for Regex ends_with checks" <|
+            "Hello".ends_with "[a-z]" Regex_Matcher.new . should_be_true
+            "Hello!".ends_with "[a-z]" Regex_Matcher.new . should_be_false
+
+            "foobar" . ends_with ".o." Regex_Matcher.new . should_be_false
+            "foobar" . ends_with ".a." Regex_Matcher.new . should_be_true
+
+            "123 meters and 4 centimeters" . ends_with "[0-9]+" Regex_Matcher.new . should_be_false
+            "foo 123" . ends_with "[0-9]+" Regex_Matcher.new . should_be_true
+
+            "rzeczywistość" . ends_with "ć" Regex_Matcher.new . should_be_true
+            "rzeczywistość" . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
+            'rzeczywistos\u{301}c\u{301}' . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
+            'rzeczywistos\u{301}c\u{301}' . ends_with 'ć' Regex_Matcher.new . should_be_true
+            "rzeczywistość" . ends_with "c" Regex_Matcher.new . should_be_false
+            'rzeczywistos\u{301}c\u{301}' . ends_with 'c' Regex_Matcher.new . should_be_false
+
+            'rzeczywistos\u{301}c\u{301}' . ends_with 'Ć' (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
+            "fOOBar" . ends_with ".A." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
+            "faaaar" . ends_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
+
+            long_text = """
+                Hnnnn EOL
+                SOL
+            long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
+            long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
+
+            "zzzaaa" . ends_with "a|b" Regex_Matcher.new . should_be_true
+            "zzzbbb" . ends_with "a|b" Regex_Matcher.new . should_be_true
+            "aaazzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
+            "bbbzzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
+            "zzzaaa" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
+            "zzzbbb" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
+            "aaazzz" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_false
+            "ABC" . ends_with "C\z" Regex_Matcher.new . should_be_true
+            "ABC" . ends_with "\AC\z" Regex_Matcher.new . should_be_false
+            "foobar" . ends_with "" Regex_Matcher.new . should_be_true
+            "" . ends_with "" Regex_Matcher.new . should_be_true
+
    Test.group "Regex matching" <|
        Test.specify "should be possible on text" <|
            match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First