Data analysts should be able to use Text.starts_with and Text.ends_with (#3292)

Implements https://www.pivotaltracker.com/story/show/181265900
This commit is contained in:
Radosław Waśko 2022-02-23 17:48:33 +01:00 committed by GitHub
parent a13c6e84b5
commit 2ae636f63c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 256 additions and 39 deletions

View File

@ -50,6 +50,8 @@
search.][3285]
- [Implemented new `Text.take` and `Text.drop` functions, replacing existing
functions][3287]
- [Implemented new `Text.starts_with` and `Text.ends_with` functions, replacing
existing functions][3292]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -78,6 +80,7 @@
[3282]: https://github.com/enso-org/enso/pull/3282
[3285]: https://github.com/enso-org/enso/pull/3285
[3287]: https://github.com/enso-org/enso/pull/3287
[3292]: https://github.com/enso-org/enso/pull/3292
#### Enso Compiler

View File

@ -711,6 +711,10 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
Arguments:
- prefix: The prefix to see if `this` starts with.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher.
If a `Regex_Matcher`, the prefix is used as a regular expression and
matched using the associated options.
! Unicode Equality
The definition of equality includes Unicode canonicalization. I.e. two
@ -718,12 +722,39 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
ensures that different ways of expressing the same character in the
underlying binary representation are considered equal.
> Example
See if the text "Hello" starts with the prefix "hi".
The regex engine, however, does not always respect this canonical
equivalence. The current behaviour is as follows:
"Hello".starts_with "hi"
Text.starts_with : Text -> Boolean
Text.starts_with prefix = Text_Utils.starts_with this prefix
'ś' . starts_with 's' == False
's\u{301}' . starts_with 's' == False
's\u{301}' . starts_with 'ś' == True
'ś' . starts_with 's\u{301}' == True
'ś' . starts_with 's' (Regex_Matcher.new) == True
's\u{301}' . starts_with 's' (Regex_Matcher.new) == True
's\u{301}' . starts_with 'ś' (Regex_Matcher.new) == True
'ś' . starts_with 's\u{301}' (Regex_Matcher.new) == True
> Example
See if the text "Hello!" starts with the specified prefix.
"Hello!".starts_with "Hello" == True
"Hello!".starts_with "hello" == False
"Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) == True
"Hello!".starts_with "[a-z]" Regex_Matcher.new == False
"Hello!".starts_with "[A-Z]" Regex_Matcher.new == True
Text.starts_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
Text.starts_with prefix matcher=Text_Matcher.new = case matcher of
    Text_Matcher case_sensitivity ->
        # Only the leading part of `this`, as long as `prefix`, takes part in
        # the comparison.
        leading = this.take (Text_Sub_Range.First prefix.length)
        case case_sensitivity of
            True -> leading == prefix
            Case_Insensitive locale -> leading.equals_ignore_case prefix locale=locale
    Regex_Matcher _ _ _ _ _ ->
        # `\A` pins the match to the very beginning of the text. The
        # non-capturing group keeps alternations inside `prefix` (e.g. `a|b`)
        # under that anchor as a whole.
        anchored = "\A(?:" + prefix + ")"
        found = (here.prepare_regex anchored matcher).match this Mode.First
        found.is_nothing.not
## ALIAS Check Suffix
@ -731,6 +762,10 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix
Arguments:
- suffix: The suffix to see if `this` ends with.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher.
If a `Regex_Matcher`, the suffix is used as a regular expression and
matched using the associated options.
! Unicode Equality
The definition of equality includes Unicode canonicalization. I.e. two
@ -739,10 +774,24 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix
underlying binary representation are considered equal.
> Example
See if the text "Hello" ends with the suffix "low".
"Hello".ends_with "low"
Text.ends_with : Text -> Boolean
Text.ends_with suffix = Text_Utils.ends_with this suffix
See if the text "Hello World" ends with the specified suffix.
"Hello World".ends_with "World" == True
"Hello World".ends_with "world" == False
"Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) == True
"Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new == True
Text.ends_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
Text.ends_with suffix matcher=Text_Matcher.new = case matcher of
    Text_Matcher case_sensitivity ->
        # Only the trailing part of `this`, as long as `suffix`, takes part in
        # the comparison.
        trailing = this.take (Text_Sub_Range.Last suffix.length)
        case case_sensitivity of
            True -> trailing == suffix
            Case_Insensitive locale -> trailing.equals_ignore_case suffix locale=locale
    Regex_Matcher _ _ _ _ _ ->
        # `\z` pins the match to the very end of the text. The non-capturing
        # group keeps alternations inside `suffix` (e.g. `a|b`) under that
        # anchor as a whole.
        anchored = "(?:" + suffix + ")\z"
        found = (here.prepare_regex anchored matcher).match this Mode.First
        found.is_nothing.not
## ALIAS Contains
@ -801,14 +850,8 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
True -> Text_Utils.contains this term
Case_Insensitive locale ->
Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale)
Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
case_insensitive = case case_sensitive of
True -> False
## TODO [RW] Currently locale is not supported in case-insensitive
Regex matching. There are plans to revisit it:
https://www.pivotaltracker.com/story/show/181313576
Case_Insensitive _ -> True
compiled_pattern = Regex.compile term case_insensitive=case_insensitive match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
Regex_Matcher _ _ _ _ _ ->
compiled_pattern = here.prepare_regex term matcher
match = compiled_pattern.match this Mode.First
match.is_nothing.not
@ -997,3 +1040,16 @@ Text.to_lower_case locale=Locale.default =
Text.to_upper_case : Locale.Locale -> Text
Text.to_upper_case locale=Locale.default =
UCharacter.toUpperCase locale.java_locale this
## PRIVATE
   Compiles `pattern` into a regex, forwarding the options stored in
   `regex_matcher` to `Regex.compile`.
prepare_regex : Text -> Regex_Matcher -> Pattern
prepare_regex pattern regex_matcher = case regex_matcher of
    Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
        # TODO [RW] Currently locale is not supported in case-insensitive
        # Regex matching. There are plans to revisit it:
        # https://www.pivotaltracker.com/story/show/181313576
        ignore_case = case case_sensitive of
            True -> False
            Case_Insensitive _ -> True
        Regex.compile pattern case_insensitive=ignore_case match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments

View File

@ -157,28 +157,6 @@ public class Text_Utils {
return String.valueOf(chars);
}
/**
 * Tests whether {@code str} begins with {@code prefix}.
 *
 * <p>The comparison is an exact, case-sensitive match of the underlying characters; no Unicode
 * normalization is applied.
 *
 * @param str the string to inspect
 * @param prefix the candidate prefix
 * @return {@code true} if the leading characters of {@code str} are exactly {@code prefix}
 */
public static boolean starts_with(String str, String prefix) {
  // Compare the first prefix.length() characters of str against the whole prefix.
  return str.regionMatches(0, prefix, 0, prefix.length());
}
/**
 * Tests whether {@code str} finishes with {@code suffix}.
 *
 * <p>The comparison is an exact, case-sensitive match of the underlying characters; no Unicode
 * normalization is applied.
 *
 * @param str the string to inspect
 * @param suffix the candidate suffix
 * @return {@code true} if the trailing characters of {@code str} are exactly {@code suffix}
 */
public static boolean ends_with(String str, String suffix) {
  int suffixLength = suffix.length();
  // A negative start offset (suffix longer than str) makes regionMatches return false.
  return str.regionMatches(str.length() - suffixLength, suffix, 0, suffixLength);
}
/**
* Compares {@code a} to {@code b} according to the lexicographical order, handling Unicode
* normalization.

View File

@ -90,6 +90,7 @@ spec =
"I" . equals_ignore_case "ı" . should_be_true
"İ" . equals_ignore_case "i" . should_be_false
"İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
"I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
"Kongressstraße"=="Kongressstrasse" . should_be_false
"Kongressstraße" . equals_ignore_case "Kongressstrasse" . should_be_true
@ -425,6 +426,185 @@ spec =
long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
Test.specify "should check for starts_with using Unicode normalization" <|
"Hello".starts_with "He" . should_be_true
"Ściana".starts_with 'S\u{301}' . should_be_true
"Ściana".starts_with 'Ś' . should_be_true
"Ściana".starts_with 'S' . should_be_false
'S\u{301}ciana'.starts_with 'Ś' . should_be_true
'S\u{301}ciana'.starts_with 'S\u{301}' . should_be_true
'S\u{301}ciana'.starts_with 'S' . should_be_false
"ABC" . starts_with "A" . should_be_true
"ABC" . starts_with "a" . should_be_false
"" . starts_with "foo" . should_be_false
"abc" . starts_with "" . should_be_true
"" . starts_with "" . should_be_true
"foo foo foo" . starts_with "foo" . should_be_true
"Hello!".starts_with "he" . should_be_false
Test.specify "starts_with should work as shown in the examples" <|
"Hello!".starts_with "Hello" . should_be_true
"Hello!".starts_with "hello" . should_be_false
"Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) . should_be_true
"Hello!".starts_with "[a-z]" Regex_Matcher.new . should_be_false
"Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
Test.specify "should allow for case-insensitive starts_with checks" <|
"Hello".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
"Ściana".starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
"Ściana".starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
'S\u{301}ciana'.starts_with 'ś' (Text_Matcher Case_Insensitive.new) . should_be_true
'S\u{301}ciana'.starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
'S\u{301}ciana'.starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
"ABC" . starts_with "A" (Text_Matcher Case_Insensitive.new) . should_be_true
"ABC" . starts_with "a" (Text_Matcher Case_Insensitive.new) . should_be_true
"ABC" . starts_with "C" (Text_Matcher Case_Insensitive.new) . should_be_false
"" . starts_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
"abc" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
"" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
"fOo FOO foo" . starts_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
"Hello!".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
Test.specify "should allow for Regex starts_with checks" <|
"Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
"foobar" . starts_with ".o." Regex_Matcher.new . should_be_true
"foob" . starts_with ".f." Regex_Matcher.new . should_be_false
"123 meters and 4 centimeters" . starts_with "[0-9]+" Regex_Matcher.new . should_be_true
"foo 123" . starts_with "[0-9]+" Regex_Matcher.new . should_be_false
# Correct non-regex behaviour for reference.
# NOTE: a bare `... == False` only computes a Boolean that is then discarded,
# so it can never fail; each check is therefore written as an assertion.
'ś' . starts_with 's' . should_be_false
's\u{301}' . starts_with 's' . should_be_false
's\u{301}' . starts_with 'ś' . should_be_true
'ś' . starts_with 's\u{301}' . should_be_true
# These two behave as expected.
's\u{301}' . starts_with 'ś' (Regex_Matcher.new) . should_be_true
'ś' . starts_with 's\u{301}' (Regex_Matcher.new) . should_be_true
# These two are included to document the current behaviour
# (even though ideally, we would want them to return False).
'ś' . starts_with 's' (Regex_Matcher.new) . should_be_true
's\u{301}' . starts_with 's' (Regex_Matcher.new) . should_be_true
"ściana" . starts_with "ś" Regex_Matcher.new . should_be_true
"ściana" . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
's\u{301}ciana' . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
's\u{301}ciana' . starts_with 'ś' Regex_Matcher.new . should_be_true
## These two tests below are disabled due to how regex is handling
letters with accents. See the tests above for explanation.
#"ściana" . starts_with "s" Regex_Matcher.new . should_be_false
# 's\u{301}ciana' . starts_with 's' Regex_Matcher.new . should_be_false
"fOOBar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
"faaaar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
long_text = """
EOL
SOL Hmm...
long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
"aaazzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
"bbbzzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
"zzzaaa" . starts_with "a|b" Regex_Matcher.new . should_be_false
"zzzbbb" . starts_with "a|b" Regex_Matcher.new . should_be_false
"aaazzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
"bbbzzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
"zzzaaa" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_false
"ABC" . starts_with "\AA" Regex_Matcher.new . should_be_true
"ABC" . starts_with "\AA\z" Regex_Matcher.new . should_be_false
"foobar" . starts_with "" Regex_Matcher.new . should_be_true
"" . starts_with "" Regex_Matcher.new . should_be_true
Test.specify "should check for ends_with using Unicode normalization" <|
"Hello".ends_with "lo" . should_be_true
"Hello".ends_with "LO" . should_be_false
"rzeczywistość".ends_with 'c\u{301}' . should_be_true
"rzeczywistość".ends_with 'ć' . should_be_true
"rzeczywistość".ends_with 'c' . should_be_false
'rzeczywistos\u{301}c\u{301}'.ends_with 'ć' . should_be_true
'rzeczywistos\u{301}c\u{301}'.ends_with 'c\u{301}' . should_be_true
'rzeczywistos\u{301}c\u{301}'.ends_with 'c' . should_be_false
"ABC" . ends_with "C" . should_be_true
"ABC" . ends_with "c" . should_be_false
"" . ends_with "foo" . should_be_false
"abc" . ends_with "" . should_be_true
"" . ends_with "" . should_be_true
"foo foo foo" . ends_with "foo" . should_be_true
Test.specify "ends_with should work as shown in the examples" <|
"Hello World".ends_with "World" . should_be_true
"Hello World".ends_with "world" . should_be_false
"Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) . should_be_true
"Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new . should_be_true
Test.specify "should allow for case-insensitive ends_with checks" <|
"Hello".ends_with "LO" (Text_Matcher Case_Insensitive.new) . should_be_true
"rzeczywistość".ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
"rzeczywistość".ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
'rzeczywistos\u{301}c\u{301}'.ends_with 'Ć' (Text_Matcher Case_Insensitive.new) . should_be_true
'rzeczywistos\u{301}c\u{301}'.ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
'rzeczywistos\u{301}c\u{301}'.ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
"ABC" . ends_with "C" (Text_Matcher Case_Insensitive.new) . should_be_true
"ABC" . ends_with "c" (Text_Matcher Case_Insensitive.new) . should_be_true
"ABC" . ends_with "A" (Text_Matcher Case_Insensitive.new) . should_be_false
"" . ends_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
"abc" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
"" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
"fOo FOO fOo" . ends_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
Test.specify "should allow for Regex ends_with checks" <|
"Hello".ends_with "[a-z]" Regex_Matcher.new . should_be_true
"Hello!".ends_with "[a-z]" Regex_Matcher.new . should_be_false
"foobar" . ends_with ".o." Regex_Matcher.new . should_be_false
"foobar" . ends_with ".a." Regex_Matcher.new . should_be_true
"123 meters and 4 centimeters" . ends_with "[0-9]+" Regex_Matcher.new . should_be_false
"foo 123" . ends_with "[0-9]+" Regex_Matcher.new . should_be_true
"rzeczywistość" . ends_with "ć" Regex_Matcher.new . should_be_true
"rzeczywistość" . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
'rzeczywistos\u{301}c\u{301}' . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
'rzeczywistos\u{301}c\u{301}' . ends_with 'ć' Regex_Matcher.new . should_be_true
"rzeczywistość" . ends_with "c" Regex_Matcher.new . should_be_false
'rzeczywistos\u{301}c\u{301}' . ends_with 'c' Regex_Matcher.new . should_be_false
'rzeczywistos\u{301}c\u{301}' . ends_with 'Ć' (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
"fOOBar" . ends_with ".A." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
"faaaar" . ends_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
long_text = """
Hnnnn EOL
SOL
long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
"zzzaaa" . ends_with "a|b" Regex_Matcher.new . should_be_true
"zzzbbb" . ends_with "a|b" Regex_Matcher.new . should_be_true
"aaazzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
"bbbzzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
"zzzaaa" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
"zzzbbb" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
"aaazzz" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_false
"ABC" . ends_with "C\z" Regex_Matcher.new . should_be_true
"ABC" . ends_with "\AC\z" Regex_Matcher.new . should_be_false
"foobar" . ends_with "" Regex_Matcher.new . should_be_true
"" . ends_with "" Regex_Matcher.new . should_be_true
Test.group "Regex matching" <|
Test.specify "should be possible on text" <|
match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First