Data analysts should be able to use Text.starts_with and Text.ends_with (#3292)

Implements https://www.pivotaltracker.com/story/show/181265900
2024-11-22 03:32:23 +03:00 · 2022-02-23 17:48:33 +01:00 · 2022-02-23 17:48:33 +01:00 · 2ae636f63c
commit 2ae636f63c
parent a13c6e84b5
4 changed files with 256 additions and 39 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -50,6 +50,8 @@
  search.][3285]
 - [Implemented new `Text.take` and `Text.drop` functions, replacing existing
  functions][3287]
 - [Implemented new `Text.starts_with` and `Text.ends_with` functions, replacing
  existing functions][3292]
 [debug-shortcuts]:
  https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -78,6 +80,7 @@
 [3282]: https://github.com/enso-org/enso/pull/3282
 [3285]: https://github.com/enso-org/enso/pull/3285
 [3287]: https://github.com/enso-org/enso/pull/3287
 [3292]: https://github.com/enso-org/enso/pull/3292
 #### Enso Compiler
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
@ -711,6 +711,10 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
   Arguments:
   - prefix: The prefix to see if `this` starts with.
   - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
     rules specified in the matcher.
     If a `Regex_Matcher`, the prefix is used as a regular expression that must
     match at the very start of the text, using the associated options.
   ! Unicode Equality
     The definition of equality includes Unicode canonicalization. I.e. two
@ -718,12 +722,39 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
     ensures that different ways of expressing the same character in the
     underlying binary representation are considered equal.
-   > Example
+     This however is not always well handled by the regex engine. The behaviour
-     See if the text "Hello" starts with the prefix "hi".
+     is as follows:
-         "Hello".starts_with "hi"
+         'ś' . starts_with 's' == False
-Text.starts_with : Text -> Boolean
+         's\u{301}' . starts_with 's' == False
-Text.starts_with prefix = Text_Utils.starts_with this prefix
+         's\u{301}' . starts_with 'ś' == True
         'ś' . starts_with 's\u{301}' == True
         'ś' . starts_with 's' (Regex_Matcher.new) == True
         's\u{301}' . starts_with 's' (Regex_Matcher.new) == True
         's\u{301}' . starts_with 'ś' (Regex_Matcher.new) == True
         'ś' . starts_with 's\u{301}' (Regex_Matcher.new) == True
   > Example
     See if the text "Hello!" starts with the specified prefix.
         "Hello!".starts_with "Hello" == True
         "Hello!".starts_with "hello" == False
         "Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) == True
         "Hello!".starts_with "[a-z]" Regex_Matcher.new == False
         "Hello!".starts_with "[A-Z]" Regex_Matcher.new == True
 Text.starts_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
 Text.starts_with prefix matcher=Text_Matcher.new = case matcher of
    Text_Matcher case_sensitivity ->
        # Plain-text matching: compare the leading `prefix.length` characters
        # of `this` with `prefix`.
        # NOTE(review): if case folding changes the length of the text (the
        # test suite treats "ß" and "ss" as equal when ignoring case), taking
        # exactly `prefix.length` characters may cut the wrong amount - confirm.
        leading_part = this.take (Text_Sub_Range.First prefix.length)
        case case_sensitivity of
            True -> leading_part == prefix
            Case_Insensitive locale -> leading_part.equals_ignore_case prefix locale=locale
    Regex_Matcher _ _ _ _ comments ->
        # `\A` pins the match to the start of the text; the non-capturing group
        # keeps alternations such as `a|b` inside the anchored expression.
        # When the `comments` option is on, a trailing `# ...` in `prefix`
        # would otherwise swallow the closing parenthesis, so the line is
        # ended first (assumes the engine ignores whitespace in that mode, as
        # java.util.regex does - TODO confirm).
        group_end = if comments then '\n)' else ")"
        preprocessed_pattern = "\A(?:" + prefix + group_end
        compiled_pattern = here.prepare_regex preprocessed_pattern matcher
        match = compiled_pattern.match this Mode.First
        match.is_nothing.not
 ## ALIAS Check Suffix
@ -731,6 +762,10 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix
   Arguments:
   - suffix: The suffix to see if `this` ends with.
   - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
     rules specified in the matcher.
     If a `Regex_Matcher`, the suffix is used as a regular expression that must
     match at the very end of the text, using the associated options.
   ! Unicode Equality
     The definition of equality includes Unicode canonicalization. I.e. two
@ -739,10 +774,24 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix
     underlying binary representation are considered equal.
   > Example
-     See if the text "Hello" ends with the suffix "low".
+     See if the text "Hello World" ends with the specified suffix.
-         "Hello".ends_with "low"
+
-Text.ends_with : Text -> Boolean
+         "Hello World".ends_with "World" == True
-Text.ends_with suffix = Text_Utils.ends_with this suffix
+         "Hello World".ends_with "world" == False
         "Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) == True
         "Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new == True
 Text.ends_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
 Text.ends_with suffix matcher=Text_Matcher.new = case matcher of
    Text_Matcher case_sensitivity ->
        # Plain-text matching: compare the trailing `suffix.length` characters
        # of `this` with `suffix`.
        # NOTE(review): if case folding changes the length of the text (the
        # test suite treats "ß" and "ss" as equal when ignoring case), taking
        # exactly `suffix.length` characters may cut the wrong amount - confirm.
        trailing_part = this.take (Text_Sub_Range.Last suffix.length)
        case case_sensitivity of
            True -> trailing_part == suffix
            Case_Insensitive locale -> trailing_part.equals_ignore_case suffix locale=locale
    Regex_Matcher _ _ _ _ comments ->
        # `\z` pins the match to the end of the text; the non-capturing group
        # keeps alternations such as `a|b` inside the anchored expression.
        # When the `comments` option is on, a trailing `# ...` in `suffix`
        # would otherwise swallow both the closing parenthesis and the `\z`
        # anchor, so the line is ended first (assumes the engine ignores
        # whitespace in that mode, as java.util.regex does - TODO confirm).
        group_end = if comments then '\n)' else ")"
        preprocessed_pattern = "(?:" + suffix + group_end + "\z"
        compiled_pattern = here.prepare_regex preprocessed_pattern matcher
        match = compiled_pattern.match this Mode.First
        match.is_nothing.not
 ## ALIAS Contains
@ -801,14 +850,8 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
        True -> Text_Utils.contains this term
        Case_Insensitive locale ->
            Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale)
-    Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
+    Regex_Matcher _ _ _ _ _ ->
-        case_insensitive = case case_sensitive of
+        compiled_pattern = here.prepare_regex term matcher
            True -> False
            ## TODO [RW] Currently locale is not supported in case-insensitive
               Regex matching. There are plans to revisit it:
               https://www.pivotaltracker.com/story/show/181313576
            Case_Insensitive _ -> True
        compiled_pattern = Regex.compile term case_insensitive=case_insensitive match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
        match = compiled_pattern.match this Mode.First
        match.is_nothing.not
@ -997,3 +1040,16 @@ Text.to_lower_case locale=Locale.default =
 Text.to_upper_case : Locale.Locale -> Text
 Text.to_upper_case locale=Locale.default =
    UCharacter.toUpperCase locale.java_locale this
 ## PRIVATE
   Compiles `pattern` with `Regex.compile`, forwarding the options stored in
   the given `Regex_Matcher` and translating its case-sensitivity setting into
   the `case_insensitive` flag.
 prepare_regex : Text -> Regex_Matcher -> Pattern
 prepare_regex pattern regex_matcher = case regex_matcher of
    Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
        ignore_case = case case_sensitive of
            True -> False
            ## TODO [RW] Currently locale is not supported in case-insensitive
               Regex matching. There are plans to revisit it:
               https://www.pivotaltracker.com/story/show/181313576
            Case_Insensitive _ -> True
        Regex.compile pattern case_insensitive=ignore_case match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
--- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
+++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
@ -157,28 +157,6 @@ public class Text_Utils {
    return String.valueOf(chars);
  }
  /**
   * Tells whether {@code str} begins with {@code prefix}.
   *
   * <p>Delegates to {@link String#startsWith(String, int)} with an offset of zero.
   *
   * @param str the string to inspect
   * @param prefix the candidate prefix
   * @return {@code true} if {@code str} begins with {@code prefix}, {@code false} otherwise
   */
  public static boolean starts_with(String str, String prefix) {
    final int start = 0;
    return str.startsWith(prefix, start);
  }
  /**
   * Tells whether {@code str} finishes with {@code suffix}.
   *
   * <p>Checks whether {@code suffix} occurs in {@code str} at the offset where a suffix of that
   * length would have to begin.
   *
   * @param str the string to inspect
   * @param suffix the candidate suffix
   * @return {@code true} if {@code str} finishes with {@code suffix}, {@code false} otherwise
   */
  public static boolean ends_with(String str, String suffix) {
    // A negative offset (suffix longer than str) makes startsWith return false.
    final int tailStart = str.length() - suffix.length();
    return str.startsWith(suffix, tailStart);
  }
  /**
   * Compares {@code a} to {@code b} according to the lexicographical order, handling Unicode
   * normalization.
--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@ -90,6 +90,7 @@ spec =
            "I" . equals_ignore_case "ı" . should_be_true
            "İ" . equals_ignore_case "i" . should_be_false
            "İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
            "I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
            "Kongressstraße"=="Kongressstrasse" . should_be_false
            "Kongressstraße" . equals_ignore_case "Kongressstrasse" . should_be_true
@ -425,6 +426,185 @@ spec =
            long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
            long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
        # Prefix checks with the default matcher: precomposed and
        # combining-accent spellings of the same letter, case differences, and
        # empty text / empty prefix edge cases.
        Test.specify "should check for starts_with using Unicode normalization" <|
            "Hello".starts_with "He" . should_be_true
            "Ściana".starts_with 'S\u{301}' . should_be_true
            "Ściana".starts_with 'Ś' . should_be_true
            "Ściana".starts_with 'S' . should_be_false
            'S\u{301}ciana'.starts_with 'Ś' . should_be_true
            'S\u{301}ciana'.starts_with 'S\u{301}' . should_be_true
            'S\u{301}ciana'.starts_with 'S' . should_be_false
            "ABC" . starts_with "A" . should_be_true
            "ABC" . starts_with "a" . should_be_false
            "" . starts_with "foo" . should_be_false
            "abc" . starts_with "" . should_be_true
            "" . starts_with "" . should_be_true
            "foo foo foo" . starts_with "foo" . should_be_true
            "Hello!".starts_with "he" . should_be_false
        # Mirrors the examples listed in the `Text.starts_with` documentation.
        Test.specify "starts_with should work as shown in the examples" <|
            "Hello!".starts_with "Hello" . should_be_true
            "Hello!".starts_with "hello" . should_be_false
            "Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) . should_be_true
            "Hello!".starts_with "[a-z]" Regex_Matcher.new . should_be_false
            "Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
        # Prefix checks using `Text_Matcher Case_Insensitive.new`: the same
        # inputs as the case-sensitive group, with the prefix given in a
        # different letter case.
        Test.specify "should allow for case-insensitive starts_with checks" <|
            "Hello".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
            "Ściana".starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
            "Ściana".starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
            'S\u{301}ciana'.starts_with 'ś' (Text_Matcher Case_Insensitive.new) . should_be_true
            'S\u{301}ciana'.starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
            'S\u{301}ciana'.starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
            "ABC" . starts_with "A" (Text_Matcher Case_Insensitive.new) . should_be_true
            "ABC" . starts_with "a" (Text_Matcher Case_Insensitive.new) . should_be_true
            "ABC" . starts_with "C" (Text_Matcher Case_Insensitive.new) . should_be_false
            "" . starts_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
            "abc" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
            "" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
            "fOo FOO foo" . starts_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
            "Hello!".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
        # Prefix checks using `Regex_Matcher`: the pattern has to match at the
        # very start of the text. Also pins down how the regex engine currently
        # treats accented letters compared to plain-text matching.
        Test.specify "should allow for Regex starts_with checks" <|
            "Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
            "foobar" . starts_with ".o." Regex_Matcher.new . should_be_true
            "foob" . starts_with ".f." Regex_Matcher.new . should_be_false
            "123 meters and 4 centimeters" . starts_with "[0-9]+" Regex_Matcher.new . should_be_true
            "foo 123" . starts_with "[0-9]+" Regex_Matcher.new . should_be_false
            # Correct non-regex behaviour for reference.
            # These used to be bare `== True` / `== False` expressions whose
            # result was discarded; they are real assertions now.
            'ś' . starts_with 's' . should_be_false
            's\u{301}' . starts_with 's' . should_be_false
            's\u{301}' . starts_with 'ś' . should_be_true
            'ś' . starts_with 's\u{301}' . should_be_true
            # These two behave as expected.
            's\u{301}' . starts_with 'ś' (Regex_Matcher.new) . should_be_true
            'ś' . starts_with 's\u{301}' (Regex_Matcher.new) . should_be_true
            ## These two are included to document the current behaviour
               (even though ideally, we would want them to return False).
            'ś' . starts_with 's' (Regex_Matcher.new) . should_be_true
            's\u{301}' . starts_with 's' (Regex_Matcher.new) . should_be_true
            "ściana" . starts_with "ś" Regex_Matcher.new . should_be_true
            "ściana" . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
            's\u{301}ciana' . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
            's\u{301}ciana' . starts_with 'ś' Regex_Matcher.new . should_be_true
            ## These two tests below are disabled due to how regex is handling
               letters with accents. See the tests above for explanation.
            #"ściana" . starts_with "s" Regex_Matcher.new . should_be_false
            # 's\u{301}ciana' . starts_with 's' Regex_Matcher.new . should_be_false
            "fOOBar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
            "faaaar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
            long_text = """
                EOL
                SOL Hmm...
            long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
            long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
            "aaazzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
            "bbbzzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
            "zzzaaa" . starts_with "a|b" Regex_Matcher.new . should_be_false
            "zzzbbb" . starts_with "a|b" Regex_Matcher.new . should_be_false
            "aaazzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
            "bbbzzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
            "zzzaaa" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_false
            "ABC" . starts_with "\AA" Regex_Matcher.new . should_be_true
            "ABC" . starts_with "\AA\z" Regex_Matcher.new . should_be_false
            "foobar" . starts_with "" Regex_Matcher.new . should_be_true
            "" . starts_with "" Regex_Matcher.new . should_be_true
        # Suffix checks with the default matcher: precomposed and
        # combining-accent spellings of the same letter, case differences, and
        # empty text / empty suffix edge cases.
        Test.specify "should check for ends_with using Unicode normalization" <|
            "Hello".ends_with "lo" . should_be_true
            "Hello".ends_with "LO" . should_be_false
            "rzeczywistość".ends_with 'c\u{301}' . should_be_true
            "rzeczywistość".ends_with 'ć' . should_be_true
            "rzeczywistość".ends_with 'c' . should_be_false
            'rzeczywistos\u{301}c\u{301}'.ends_with 'ć' . should_be_true
            'rzeczywistos\u{301}c\u{301}'.ends_with 'c\u{301}' . should_be_true
            'rzeczywistos\u{301}c\u{301}'.ends_with 'c' . should_be_false
            "ABC" . ends_with "C" . should_be_true
            "ABC" . ends_with "c" . should_be_false
            "" . ends_with "foo" . should_be_false
            "abc" . ends_with "" . should_be_true
            "" . ends_with "" . should_be_true
            "foo foo foo" . ends_with "foo" . should_be_true
        # Mirrors the examples listed in the `Text.ends_with` documentation.
        Test.specify "ends_with should work as shown in the examples" <|
            "Hello World".ends_with "World" . should_be_true
            "Hello World".ends_with "world" . should_be_false
            "Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) . should_be_true
            "Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new . should_be_true
        # Suffix checks using `Text_Matcher Case_Insensitive.new`: the same
        # inputs as the case-sensitive group, with the suffix given in a
        # different letter case.
        Test.specify "should allow for case-insensitive ends_with checks" <|
            "Hello".ends_with "LO" (Text_Matcher Case_Insensitive.new) . should_be_true
            "rzeczywistość".ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
            "rzeczywistość".ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
            'rzeczywistos\u{301}c\u{301}'.ends_with 'Ć' (Text_Matcher Case_Insensitive.new) . should_be_true
            'rzeczywistos\u{301}c\u{301}'.ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
            'rzeczywistos\u{301}c\u{301}'.ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
            "ABC" . ends_with "C" (Text_Matcher Case_Insensitive.new) . should_be_true
            "ABC" . ends_with "c" (Text_Matcher Case_Insensitive.new) . should_be_true
            "ABC" . ends_with "A" (Text_Matcher Case_Insensitive.new) . should_be_false
            "" . ends_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
            "abc" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
            "" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
            "fOo FOO fOo" . ends_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
        # Suffix checks using `Regex_Matcher`: the pattern has to match at the
        # very end of the text. Covers accented letters, case-insensitive
        # matching, `dot_matches_newline`, alternation and explicit anchors
        # inside the user pattern.
        Test.specify "should allow for Regex ends_with checks" <|
            "Hello".ends_with "[a-z]" Regex_Matcher.new . should_be_true
            "Hello!".ends_with "[a-z]" Regex_Matcher.new . should_be_false
            "foobar" . ends_with ".o." Regex_Matcher.new . should_be_false
            "foobar" . ends_with ".a." Regex_Matcher.new . should_be_true
            "123 meters and 4 centimeters" . ends_with "[0-9]+" Regex_Matcher.new . should_be_false
            "foo 123" . ends_with "[0-9]+" Regex_Matcher.new . should_be_true
            "rzeczywistość" . ends_with "ć" Regex_Matcher.new . should_be_true
            "rzeczywistość" . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
            'rzeczywistos\u{301}c\u{301}' . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
            'rzeczywistos\u{301}c\u{301}' . ends_with 'ć' Regex_Matcher.new . should_be_true
            "rzeczywistość" . ends_with "c" Regex_Matcher.new . should_be_false
            'rzeczywistos\u{301}c\u{301}' . ends_with 'c' Regex_Matcher.new . should_be_false
            'rzeczywistos\u{301}c\u{301}' . ends_with 'Ć' (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
            "fOOBar" . ends_with ".A." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
            "faaaar" . ends_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
            long_text = """
                Hnnnn EOL
                SOL
            long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
            long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
            "zzzaaa" . ends_with "a|b" Regex_Matcher.new . should_be_true
            "zzzbbb" . ends_with "a|b" Regex_Matcher.new . should_be_true
            "aaazzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
            "bbbzzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
            "zzzaaa" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
            "zzzbbb" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
            "aaazzz" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_false
            "ABC" . ends_with "C\z" Regex_Matcher.new . should_be_true
            "ABC" . ends_with "\AC\z" Regex_Matcher.new . should_be_false
            "foobar" . ends_with "" Regex_Matcher.new . should_be_true
            "" . ends_with "" Regex_Matcher.new . should_be_true
    Test.group "Regex matching" <|
        Test.specify "should be possible on text" <|
            match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First