mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 03:32:23 +03:00
Data analysts should be able to use Text.starts_with and Text.ends_with (#3292)
Implements https://www.pivotaltracker.com/story/show/181265900
This commit is contained in:
parent
a13c6e84b5
commit
2ae636f63c
@ -50,6 +50,8 @@
|
||||
search.][3285]
|
||||
- [Implemented new `Text.take` and `Text.drop` functions, replacing existing
|
||||
functions][3287]
|
||||
- [Implemented new `Text.starts_with` and `Text.ends_with` functions, replacing
|
||||
existing functions][3292]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -78,6 +80,7 @@
|
||||
[3282]: https://github.com/enso-org/enso/pull/3282
|
||||
[3285]: https://github.com/enso-org/enso/pull/3285
|
||||
[3287]: https://github.com/enso-org/enso/pull/3287
|
||||
[3292]: https://github.com/enso-org/enso/pull/3292
|
||||
|
||||
#### Enso Compiler
|
||||
|
||||
|
@ -711,6 +711,10 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
|
||||
|
||||
Arguments:
|
||||
- prefix: The prefix to see if `this` starts with.
|
||||
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
|
||||
rules specified in the matcher.
|
||||
If a `Regex_Matcher`, the prefix is used as a regular expression and matched
|
||||
using the associated options.
|
||||
|
||||
! Unicode Equality
|
||||
The definition of equality includes Unicode canonicalization. I.e. two
|
||||
@ -718,12 +722,39 @@ Text.from_codepoints codepoints = Text_Utils.from_codepoints codepoints.to_array
|
||||
ensures that different ways of expressing the same character in the
|
||||
underlying binary representation are considered equal.
|
||||
|
||||
> Example
|
||||
See if the text "Hello" starts with the prefix "hi".
|
||||
This, however, is not always well handled by the regex engine. The behaviour
|
||||
is as follows:
|
||||
|
||||
"Hello".starts_with "hi"
|
||||
Text.starts_with : Text -> Boolean
|
||||
Text.starts_with prefix = Text_Utils.starts_with this prefix
|
||||
'ś' . starts_with 's' == False
|
||||
's\u{301}' . starts_with 's' == False
|
||||
's\u{301}' . starts_with 'ś' == True
|
||||
'ś' . starts_with 's\u{301}' == True
|
||||
|
||||
'ś' . starts_with 's' (Regex_Matcher.new) == True
|
||||
's\u{301}' . starts_with 's' (Regex_Matcher.new) == True
|
||||
's\u{301}' . starts_with 'ś' (Regex_Matcher.new) == True
|
||||
'ś' . starts_with 's\u{301}' (Regex_Matcher.new) == True
|
||||
|
||||
> Example
|
||||
See if the text "Hello!" starts with the specified prefix.
|
||||
|
||||
"Hello!".starts_with "Hello" == True
|
||||
"Hello!".starts_with "hello" == False
|
||||
"Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) == True
|
||||
"Hello!".starts_with "[a-z]" Regex_Matcher.new == False
|
||||
"Hello!".starts_with "[A-Z]" Regex_Matcher.new == True
|
||||
Text.starts_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
|
||||
Text.starts_with prefix matcher=Text_Matcher.new = case matcher of
|
||||
Text_Matcher case_sensitivity -> case case_sensitivity of
|
||||
True ->
|
||||
this.take (Text_Sub_Range.First prefix.length) == prefix
|
||||
Case_Insensitive locale ->
|
||||
this.take (Text_Sub_Range.First prefix.length) . equals_ignore_case prefix locale=locale
|
||||
Regex_Matcher _ _ _ _ _ ->
|
||||
preprocessed_pattern = "\A(?:" + prefix + ")"
|
||||
compiled_pattern = here.prepare_regex preprocessed_pattern matcher
|
||||
match = compiled_pattern.match this Mode.First
|
||||
match.is_nothing.not
|
||||
|
||||
## ALIAS Check Suffix
|
||||
|
||||
@ -731,6 +762,10 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix
|
||||
|
||||
Arguments:
|
||||
- suffix: The suffix to see if `this` ends with.
|
||||
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
|
||||
rules specified in the matcher.
|
||||
If a `Regex_Matcher`, the suffix is used as a regular expression and matched
|
||||
using the associated options.
|
||||
|
||||
! Unicode Equality
|
||||
The definition of equality includes Unicode canonicalization. I.e. two
|
||||
@ -739,10 +774,24 @@ Text.starts_with prefix = Text_Utils.starts_with this prefix
|
||||
underlying binary representation are considered equal.
|
||||
|
||||
> Example
|
||||
See if the text "Hello" ends with the suffix "low".
|
||||
"Hello".ends_with "low"
|
||||
Text.ends_with : Text -> Boolean
|
||||
Text.ends_with suffix = Text_Utils.ends_with this suffix
|
||||
See if the text "Hello World" ends with the specified suffix.
|
||||
|
||||
"Hello World".ends_with "World" == True
|
||||
"Hello World".ends_with "world" == False
|
||||
"Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) == True
|
||||
"Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new == True
|
||||
Text.ends_with : Text -> (Text_Matcher | Regex_Matcher) -> Boolean
|
||||
Text.ends_with suffix matcher=Text_Matcher.new = case matcher of
|
||||
Text_Matcher case_sensitivity -> case case_sensitivity of
|
||||
True ->
|
||||
this.take (Text_Sub_Range.Last suffix.length) == suffix
|
||||
Case_Insensitive locale ->
|
||||
this.take (Text_Sub_Range.Last suffix.length) . equals_ignore_case suffix locale=locale
|
||||
Regex_Matcher _ _ _ _ _ ->
|
||||
preprocessed_pattern = "(?:" + suffix + ")\z"
|
||||
compiled_pattern = here.prepare_regex preprocessed_pattern matcher
|
||||
match = compiled_pattern.match this Mode.First
|
||||
match.is_nothing.not
|
||||
|
||||
## ALIAS Contains
|
||||
|
||||
@ -801,14 +850,8 @@ Text.contains term="" matcher=Text_Matcher.new = case matcher of
|
||||
True -> Text_Utils.contains this term
|
||||
Case_Insensitive locale ->
|
||||
Text_Utils.contains (this.to_case_insensitive_key locale) (term.to_case_insensitive_key locale)
|
||||
Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
|
||||
case_insensitive = case case_sensitive of
|
||||
True -> False
|
||||
## TODO [RW] Currently locale is not supported in case-insensitive
|
||||
Regex matching. There are plans to revisit it:
|
||||
https://www.pivotaltracker.com/story/show/181313576
|
||||
Case_Insensitive _ -> True
|
||||
compiled_pattern = Regex.compile term case_insensitive=case_insensitive match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
|
||||
Regex_Matcher _ _ _ _ _ ->
|
||||
compiled_pattern = here.prepare_regex term matcher
|
||||
match = compiled_pattern.match this Mode.First
|
||||
match.is_nothing.not
|
||||
|
||||
@ -997,3 +1040,16 @@ Text.to_lower_case locale=Locale.default =
|
||||
Text.to_upper_case : Locale.Locale -> Text
|
||||
Text.to_upper_case locale=Locale.default =
|
||||
UCharacter.toUpperCase locale.java_locale this
|
||||
|
||||
## PRIVATE
|
||||
prepare_regex : Text -> Regex_Matcher -> Pattern
|
||||
prepare_regex pattern regex_matcher = case regex_matcher of
|
||||
Regex_Matcher case_sensitive multiline match_ascii dot_matches_newline comments ->
|
||||
case_insensitive = case case_sensitive of
|
||||
True -> False
|
||||
## TODO [RW] Currently locale is not supported in case-insensitive
|
||||
Regex matching. There are plans to revisit it:
|
||||
https://www.pivotaltracker.com/story/show/181313576
|
||||
Case_Insensitive _ -> True
|
||||
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive match_ascii=match_ascii dot_matches_newline=dot_matches_newline multiline=multiline comments=comments
|
||||
compiled_pattern
|
||||
|
@ -157,28 +157,6 @@ public class Text_Utils {
|
||||
return String.valueOf(chars);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether {@code prefix} is a prefix of {@code str}.
|
||||
*
|
||||
* @param str the string to check
|
||||
* @param prefix the potential prefix
|
||||
* @return whether {@code prefix} is a prefix of {@code str}
|
||||
*/
|
||||
public static boolean starts_with(String str, String prefix) {
|
||||
return str.startsWith(prefix);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether {@code suffix} is a suffix of {@code str}.
|
||||
*
|
||||
* @param str the string to check
|
||||
* @param suffix the potential suffix
|
||||
* @return whether {@code suffix} is a suffix of {@code str}
|
||||
*/
|
||||
public static boolean ends_with(String str, String suffix) {
|
||||
return str.endsWith(suffix);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares {@code a} to {@code b} according to the lexicographical order, handling Unicode
|
||||
* normalization.
|
||||
|
@ -90,6 +90,7 @@ spec =
|
||||
"I" . equals_ignore_case "ı" . should_be_true
|
||||
"İ" . equals_ignore_case "i" . should_be_false
|
||||
"İ" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_true
|
||||
"I" . equals_ignore_case "i" (locale = Locale.new "tr") . should_be_false
|
||||
|
||||
"Kongressstraße"=="Kongressstrasse" . should_be_false
|
||||
"Kongressstraße" . equals_ignore_case "Kongressstrasse" . should_be_true
|
||||
@ -425,6 +426,185 @@ spec =
|
||||
long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
|
||||
long_text . contains "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
|
||||
|
||||
Test.specify "should check for starts_with using Unicode normalization" <|
|
||||
"Hello".starts_with "He" . should_be_true
|
||||
|
||||
"Ściana".starts_with 'S\u{301}' . should_be_true
|
||||
"Ściana".starts_with 'Ś' . should_be_true
|
||||
"Ściana".starts_with 'S' . should_be_false
|
||||
'S\u{301}ciana'.starts_with 'Ś' . should_be_true
|
||||
'S\u{301}ciana'.starts_with 'S\u{301}' . should_be_true
|
||||
'S\u{301}ciana'.starts_with 'S' . should_be_false
|
||||
|
||||
"ABC" . starts_with "A" . should_be_true
|
||||
"ABC" . starts_with "a" . should_be_false
|
||||
"" . starts_with "foo" . should_be_false
|
||||
"abc" . starts_with "" . should_be_true
|
||||
"" . starts_with "" . should_be_true
|
||||
"foo foo foo" . starts_with "foo" . should_be_true
|
||||
|
||||
"Hello!".starts_with "he" . should_be_false
|
||||
|
||||
Test.specify "starts_with should work as shown in the examples" <|
|
||||
"Hello!".starts_with "Hello" . should_be_true
|
||||
"Hello!".starts_with "hello" . should_be_false
|
||||
"Hello!".starts_with "hello" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"Hello!".starts_with "[a-z]" Regex_Matcher.new . should_be_false
|
||||
"Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
|
||||
|
||||
Test.specify "should allow for case-insensitive starts_with checks" <|
|
||||
"Hello".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
|
||||
"Ściana".starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"Ściana".starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
|
||||
'S\u{301}ciana'.starts_with 'ś' (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
'S\u{301}ciana'.starts_with 's\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
'S\u{301}ciana'.starts_with 's' (Text_Matcher Case_Insensitive.new) . should_be_false
|
||||
|
||||
"ABC" . starts_with "A" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"ABC" . starts_with "a" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"ABC" . starts_with "C" (Text_Matcher Case_Insensitive.new) . should_be_false
|
||||
"" . starts_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
|
||||
"abc" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"" . starts_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"fOo FOO foo" . starts_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
|
||||
"Hello!".starts_with "he" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
|
||||
Test.specify "should allow for Regex starts_with checks" <|
|
||||
"Hello!".starts_with "[A-Z]" Regex_Matcher.new . should_be_true
|
||||
"foobar" . starts_with ".o." Regex_Matcher.new . should_be_true
|
||||
"foob" . starts_with ".f." Regex_Matcher.new . should_be_false
|
||||
|
||||
"123 meters and 4 centimeters" . starts_with "[0-9]+" Regex_Matcher.new . should_be_true
|
||||
"foo 123" . starts_with "[0-9]+" Regex_Matcher.new . should_be_false
|
||||
|
||||
# Correct non-regex behaviour for reference.
|
||||
'ś' . starts_with 's' == False
|
||||
's\u{301}' . starts_with 's' == False
|
||||
's\u{301}' . starts_with 'ś' == True
|
||||
'ś' . starts_with 's\u{301}' == True
|
||||
|
||||
# These two behave as expected.
|
||||
's\u{301}' . starts_with 'ś' (Regex_Matcher.new) == True
|
||||
'ś' . starts_with 's\u{301}' (Regex_Matcher.new) == True
|
||||
|
||||
## These two are included to document the current behaviour
|
||||
(even though ideally, we would want them to return False).
|
||||
'ś' . starts_with 's' (Regex_Matcher.new) == True
|
||||
's\u{301}' . starts_with 's' (Regex_Matcher.new) == True
|
||||
|
||||
"ściana" . starts_with "ś" Regex_Matcher.new . should_be_true
|
||||
"ściana" . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
|
||||
's\u{301}ciana' . starts_with 's\u{301}' Regex_Matcher.new . should_be_true
|
||||
's\u{301}ciana' . starts_with 'ś' Regex_Matcher.new . should_be_true
|
||||
|
||||
## These two tests below are disabled due to how regex is handling
|
||||
letters with accents. See the tests above for explanation.
|
||||
#"ściana" . starts_with "s" Regex_Matcher.new . should_be_false
|
||||
# 's\u{301}ciana' . starts_with 's' Regex_Matcher.new . should_be_false
|
||||
|
||||
"fOOBar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
|
||||
"faaaar" . starts_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
|
||||
|
||||
long_text = """
|
||||
EOL
|
||||
SOL Hmm...
|
||||
long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
|
||||
long_text . starts_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
|
||||
|
||||
"aaazzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
|
||||
"bbbzzz" . starts_with "a|b" Regex_Matcher.new . should_be_true
|
||||
"zzzaaa" . starts_with "a|b" Regex_Matcher.new . should_be_false
|
||||
"zzzbbb" . starts_with "a|b" Regex_Matcher.new . should_be_false
|
||||
"aaazzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
|
||||
"bbbzzz" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_true
|
||||
"zzzaaa" . starts_with "(a|b){2}" Regex_Matcher.new . should_be_false
|
||||
"ABC" . starts_with "\AA" Regex_Matcher.new . should_be_true
|
||||
"ABC" . starts_with "\AA\z" Regex_Matcher.new . should_be_false
|
||||
"foobar" . starts_with "" Regex_Matcher.new . should_be_true
|
||||
"" . starts_with "" Regex_Matcher.new . should_be_true
|
||||
|
||||
Test.specify "should check for ends_with using Unicode normalization" <|
|
||||
"Hello".ends_with "lo" . should_be_true
|
||||
"Hello".ends_with "LO" . should_be_false
|
||||
|
||||
"rzeczywistość".ends_with 'c\u{301}' . should_be_true
|
||||
"rzeczywistość".ends_with 'ć' . should_be_true
|
||||
"rzeczywistość".ends_with 'c' . should_be_false
|
||||
'rzeczywistos\u{301}c\u{301}'.ends_with 'ć' . should_be_true
|
||||
'rzeczywistos\u{301}c\u{301}'.ends_with 'c\u{301}' . should_be_true
|
||||
'rzeczywistos\u{301}c\u{301}'.ends_with 'c' . should_be_false
|
||||
|
||||
"ABC" . ends_with "C" . should_be_true
|
||||
"ABC" . ends_with "c" . should_be_false
|
||||
"" . ends_with "foo" . should_be_false
|
||||
"abc" . ends_with "" . should_be_true
|
||||
"" . ends_with "" . should_be_true
|
||||
"foo foo foo" . ends_with "foo" . should_be_true
|
||||
|
||||
Test.specify "ends_with should work as shown in the examples" <|
|
||||
"Hello World".ends_with "World" . should_be_true
|
||||
"Hello World".ends_with "world" . should_be_false
|
||||
"Hello World".ends_with "world" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"Hello World".ends_with "[A-Z][a-z]{4}" Regex_Matcher.new . should_be_true
|
||||
|
||||
Test.specify "should allow for case-insensitive ends_with checks" <|
|
||||
"Hello".ends_with "LO" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
|
||||
"rzeczywistość".ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"rzeczywistość".ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
|
||||
'rzeczywistos\u{301}c\u{301}'.ends_with 'Ć' (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
'rzeczywistos\u{301}c\u{301}'.ends_with 'C\u{301}' (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
'rzeczywistos\u{301}c\u{301}'.ends_with 'C' (Text_Matcher Case_Insensitive.new) . should_be_false
|
||||
|
||||
"ABC" . ends_with "C" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"ABC" . ends_with "c" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"ABC" . ends_with "A" (Text_Matcher Case_Insensitive.new) . should_be_false
|
||||
"" . ends_with "foo" (Text_Matcher Case_Insensitive.new) . should_be_false
|
||||
"abc" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"" . ends_with "" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
"fOo FOO fOo" . ends_with "FoO" (Text_Matcher Case_Insensitive.new) . should_be_true
|
||||
|
||||
Test.specify "should allow for Regex ends_with checks" <|
|
||||
"Hello".ends_with "[a-z]" Regex_Matcher.new . should_be_true
|
||||
"Hello!".ends_with "[a-z]" Regex_Matcher.new . should_be_false
|
||||
|
||||
"foobar" . ends_with ".o." Regex_Matcher.new . should_be_false
|
||||
"foobar" . ends_with ".a." Regex_Matcher.new . should_be_true
|
||||
|
||||
"123 meters and 4 centimeters" . ends_with "[0-9]+" Regex_Matcher.new . should_be_false
|
||||
"foo 123" . ends_with "[0-9]+" Regex_Matcher.new . should_be_true
|
||||
|
||||
"rzeczywistość" . ends_with "ć" Regex_Matcher.new . should_be_true
|
||||
"rzeczywistość" . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
|
||||
'rzeczywistos\u{301}c\u{301}' . ends_with 'c\u{301}' Regex_Matcher.new . should_be_true
|
||||
'rzeczywistos\u{301}c\u{301}' . ends_with 'ć' Regex_Matcher.new . should_be_true
|
||||
"rzeczywistość" . ends_with "c" Regex_Matcher.new . should_be_false
|
||||
'rzeczywistos\u{301}c\u{301}' . ends_with 'c' Regex_Matcher.new . should_be_false
|
||||
|
||||
'rzeczywistos\u{301}c\u{301}' . ends_with 'Ć' (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
|
||||
"fOOBar" . ends_with ".A." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_true
|
||||
"faaaar" . ends_with ".o." (Regex_Matcher.new case_sensitive=Case_Insensitive.new) . should_be_false
|
||||
|
||||
long_text = """
|
||||
Hnnnn EOL
|
||||
SOL
|
||||
long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=True) . should_be_true
|
||||
long_text . ends_with "EOL.SOL" (Regex_Matcher.new dot_matches_newline=False) . should_be_false
|
||||
|
||||
"zzzaaa" . ends_with "a|b" Regex_Matcher.new . should_be_true
|
||||
"zzzbbb" . ends_with "a|b" Regex_Matcher.new . should_be_true
|
||||
"aaazzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
|
||||
"bbbzzz" . ends_with "a|b" Regex_Matcher.new . should_be_false
|
||||
"zzzaaa" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
|
||||
"zzzbbb" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_true
|
||||
"aaazzz" . ends_with "(a|b){2}" Regex_Matcher.new . should_be_false
|
||||
"ABC" . ends_with "C\z" Regex_Matcher.new . should_be_true
|
||||
"ABC" . ends_with "\AC\z" Regex_Matcher.new . should_be_false
|
||||
"foobar" . ends_with "" Regex_Matcher.new . should_be_true
|
||||
"" . ends_with "" Regex_Matcher.new . should_be_true
|
||||
|
||||
Test.group "Regex matching" <|
|
||||
Test.specify "should be possible on text" <|
|
||||
match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Regex_Mode.First
|
||||
|
Loading…
Reference in New Issue
Block a user