Wip/gmt/match find only text (#5721)

Rename is_match + match to match + find (respectively), and remove all non-regexp functionality.

Regexp flags and Match_Mode are also no longer supported by these methods.
This commit is contained in:
GregoryTravis 2023-02-23 04:47:10 -05:00 committed by GitHub
parent 78aab133c7
commit 3a09ee88f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 104 additions and 222 deletions

View File

@ -318,6 +318,8 @@
- [Moved regex functionality out of `Text.locate` and `Text.locate_all` into
`Text.match` and `Text.match_all`.][5679]
- [`File.parent` may return `Nothing`.][5699]
- [Removed non-regex functionality from `is_match`, `match`, and `match_all`,
and renamed them to `match`, `find`, `find_all` (respectively).][5721]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -488,7 +490,9 @@
[5645]: https://github.com/enso-org/enso/pull/5645
[5646]: https://github.com/enso-org/enso/pull/5646
[5656]: https://github.com/enso-org/enso/pull/5656
[5679]: https://github.com/enso-org/enso/pull/5679
[5699]: https://github.com/enso-org/enso/pull/5699
[5721]: https://github.com/enso-org/enso/pull/5721
#### Enso Compiler

View File

@ -1,5 +1,7 @@
import project.Data.Locale.Locale
from project.Data.Boolean import Boolean, True, False
polyglot java import org.enso.base.text.TextFoldingStrategy
type Case_Sensitivity
@ -25,3 +27,11 @@ type Case_Sensitivity
Case_Sensitivity.Sensitive -> TextFoldingStrategy.unicodeNormalizedFold
Case_Sensitivity.Insensitive locale ->
TextFoldingStrategy.caseInsensitiveFold locale.java_locale
## PRIVATE
   Tells whether this setting requests case-insensitive comparison: `True`
   for the `Insensitive` variant, `False` for `Sensitive` and `Default`.
is_case_insensitive : Boolean
is_case_insensitive self = case self of
    Case_Sensitivity.Insensitive _ -> True
    Case_Sensitivity.Sensitive -> False
    Case_Sensitivity.Default -> False

View File

@ -13,6 +13,7 @@ import project.Data.Text.Encoding.Encoding
import project.Data.Text.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
import project.Data.Text.Regex_Matcher.Regex_Matcher
import project.Data.Text.Span.Span
@ -209,116 +210,77 @@ Text.characters self =
self.each bldr.append
bldr.to_vector
## ALIAS find
Matches the text in `self` against the provided `term`, returning the first
or last match if present or `Nothing` if there are no matches.
## Find the regular expression `pattern` in `self`, returning the first match
if present or `Nothing` if not found.
Arguments:
- term: The pattern to match `self` against. We recommend using _raw text_
to write your patterns.
- mode: This argument specifies whether the first or last match should be
returned.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
regular expression and matched using the associated options.
- pattern: The pattern to match `self` against.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
> Example
Find the first substring matching the regex.
example_match =
regex = "a[ab]c"
"aabbbbccccaabcaaaa".match regex == "abc"
example_find =
## This matches `abc` @ character 11
"aabbbbccccaabcaaaa".find "a[ab]c"
example_find_insensitive =
## This matches `aBc` @ character 11
"aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Compile_Error
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    # Translate the case-sensitivity setting into the flag `Regex.compile` takes.
    ignore_case = case_sensitivity.is_case_insensitive
    compiled_pattern = Regex.compile pattern case_insensitive=ignore_case
    # Only the first match is requested.
    compiled_pattern.match self Matching_Mode.First
! Last Match in Regex Mode
Regex always performs the search from the front and matching the last
occurrence means selecting the last of the matches while still generating
matches from the beginning. Regex does not return overlapping matches - it
will return a match at some position and then continue the search after that
match. This will lead to slightly different behavior for overlapping
occurrences of a pattern in Regex mode than in exact text matching mode
where the matches are searched for from the back.
> Example
Comparing Matching in Last Mode in Regex and Text mode
"aAa".match "aa" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive == "Aa"
"aAa".match "aa" mode=Matching_Mode.Last matcher=(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive) == "aA"
Text.match : Text -> Matching_Mode -> (Text_Matcher | Regex_Matcher) -> Text | Nothing
Text.match self term mode=Matching_Mode.First matcher=Regex_Matcher.Value = case matcher of
_ : Text_Matcher ->
case_sensitivity = case matcher of
Text_Matcher.Case_Sensitive -> Case_Sensitivity.Sensitive
Text_Matcher.Case_Insensitive _ -> Case_Sensitivity.Insensitive
case self.locate term mode case_sensitivity of
Nothing -> Nothing
span -> span.text
_ : Regex_Matcher -> case mode of
Matching_Mode.First ->
case matcher.compile term . match self Matching_Mode.First of
Nothing -> Nothing
match -> match.span 0 . to_grapheme_span . text
Matching_Mode.Last ->
case matcher.compile term . match self Regex_Mode.All of
Nothing -> Nothing
matches -> matches.last.span 0 . to_grapheme_span . text
## ALIAS find_all
Matches all occurrences text in `self` against the provided `term`, returning
a vector of matches.
## Finds all the matches of the regular expression `pattern` in `self`,
returning them as a Vector. The Vector is empty if there are no matches.
Arguments:
- term: The pattern to match `self` against. We recommend using _raw text_
to write your patterns.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
regular expression and matched using the associated options.
- pattern: The pattern to match `self` against.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
> Example
Find all substrings matching the regex.
Find all substrings matching the regex.
example_match =
regex = "a[ab]c"
"aabcbbccaacaa".match regex == ["abc", "aac"]
Text.match_all : Text -> (Text_Matcher | Regex_Matcher) -> Vector Text
Text.match_all self term=".*" matcher=Regex_Matcher.Value = case matcher of
_ : Text_Matcher ->
case_sensitivity = case matcher of
Text_Matcher.Case_Sensitive -> Case_Sensitivity.Sensitive
Text_Matcher.Case_Insensitive _ -> Case_Sensitivity.Insensitive
self.locate_all term case_sensitivity . map .text
_ : Regex_Matcher ->
case matcher.compile term . match self Regex_Mode.All of
Nothing -> []
matches -> matches.map m-> m.span 0 . to_grapheme_span . text
example_find_all =
## This matches `aabbbbc` @ character 0 and `aabc` @ character 10
"aabbbbccccaabcaaaa".find_all "a[ab]+c"
example_find_all_insensitive =
## This matches `aABbbbc` @ character 0 and `aaBC` @ character 10
"aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Compile_Error
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    # Translate the case-sensitivity setting into the flag `Regex.compile` takes.
    ignore_case = case_sensitivity.is_case_insensitive
    compiled_pattern = Regex.compile pattern case_insensitive=ignore_case
    # A `Nothing` result is turned into an empty vector; anything else is
    # returned unchanged.
    case compiled_pattern.match self Regex_Mode.All of
        Nothing -> []
        found -> found
## ALIAS Check Matches
Checks if the whole text in `self` matches a provided `pattern`.
Arguments:
- pattern: The pattern to match `self` against. We recommend using _raw text_
to write your patterns.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
regular expression and matched using the associated options.
- pattern: The pattern to match `self` against.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
> Example
Checks if some text matches a basic email regex. NOTE: This regex is _not_
compliant with RFC 5322.
Checks if the whole text matches a basic email regex.
example_match =
regex = ".+@.+"
"contact@enso.org".is_match regex
Text.is_match : Text -> (Text_Matcher | Regex_Matcher) -> Boolean ! Compile_Error
Text.is_match self pattern=".*" matcher=Regex_Matcher.Value = case matcher of
Text_Matcher.Case_Sensitive -> self == pattern
Text_Matcher.Case_Insensitive locale -> self.equals_ignore_case pattern locale
_ : Regex_Matcher ->
compiled_pattern = matcher.compile pattern
compiled_pattern.matches self
regex = ".+ct@.+"
# Evaluates to true
"contact@enso.org".match regex
example_match_insensitive =
regex = ".+ct@.+"
# Evaluates to true
"CONTACT@enso.org".match regex Case_Sensitivity.Insensitive
Text.match : Text -> Case_Sensitivity -> Boolean ! Compile_Error
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    # Translate the case-sensitivity setting into the flag `Regex.compile` takes.
    ignore_case = case_sensitivity.is_case_insensitive
    # Compile the pattern and ask it whether it matches `self`.
    Regex.compile pattern case_insensitive=ignore_case . matches self
## ALIAS Split Text
@ -1380,4 +1342,3 @@ slice_text text char_ranges =
char_ranges.map char_range->
sb.append text char_range.start char_range.end
sb.toString

View File

@ -57,7 +57,7 @@ type Suite_Config
should_run_group self name =
regexp = self.only_group_regexp
case regexp of
_ : Text -> name.is_match regexp . catch Any (_->True)
_ : Text -> name.match regexp . catch Any (_->True)
_ -> True
should_output_junit self =

View File

@ -57,6 +57,8 @@ type Manual
- Note that currently the regex-based operations may not handle the edge
cases described above too well.
spec =
# Asserts that `result.span 0`, converted with `to_grapheme_span`, equals `span`.
check_span result span = result.span 0 . to_grapheme_span . should_equal span
# Collection variant: maps every element `m` of `result` to `(m.span 0).to_grapheme_span` and asserts the mapped values equal `spans`.
check_span_all result spans = result . map (m-> (m.span 0).to_grapheme_span) . should_equal spans
Test.group "Text" <|
kshi = '\u0915\u094D\u0937\u093F'
facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
@ -1175,20 +1177,18 @@ spec =
Test.specify "should allow regexes in match" <|
hello = "Hello World!"
regex = Regex_Matcher.Value
regex_insensitive = Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive
hello.match ".o" Matching_Mode.First matcher=regex . should_equal "lo"
hello.match ".o" Matching_Mode.Last matcher=regex . should_equal "Wo"
hello.match_all ".o" matcher=regex . should_equal ["lo", "Wo"]
"foobar".match "BAR" Matching_Mode.First matcher=regex_insensitive . should_equal "bar"
check_span (hello.find ".o" Case_Sensitivity.Insensitive) (Span.Value (3.up_to 5) "Hello World!")
check_span_all (hello.find_all ".o") [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
check_span ("foobar".find "BAR" Case_Sensitivity.Insensitive) (Span.Value (3.up_to 6) "foobar")
## Regex matching does not do case folding
"Strasse".match "ß" Matching_Mode.First matcher=regex_insensitive . should_equal Nothing
"Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
## But it should handle the Unicode normalization
accents = 'a\u{301}e\u{301}o\u{301}'
accents.match accent_1 Matching_Mode.First matcher=regex . should_equal 'e\u{301}'
check_span (accents.find accent_1) (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
Test.specify "should correctly handle regex edge cases in locate" pending="Figure out how to make Regex correctly handle empty patterns." <|
regex = Regex_Matcher.Value
@ -1209,29 +1209,10 @@ spec =
"aaa aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (5.up_to 7) "aaa aaa")
Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
"abacadae".match_all "a[bc]" . should_equal ["ab", "ac"]
"abacadae".match_all "a." . should_equal ["ab", "ac", "ad", "ae"]
"abacadae".match_all "a.*" . should_equal ["abacadae"]
"abacadae".match_all "a.+?" . should_equal ["ab", "ac", "ad", "ae"]
"aAa".match "aa" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal "Aa"
"aAa".match "aa" mode=Matching_Mode.Last matcher=(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive) . should_equal "aA"
"abacadae".match "a[bc]" mode=Matching_Mode.Last . should_equal "ac"
"abacadae".match "a." mode=Matching_Mode.Last . should_equal "ae"
"abacadae".match "a.*" mode=Matching_Mode.Last . should_equal "abacadae"
"abacadae".match "a.+?" mode=Matching_Mode.Last . should_equal "ae"
"abacadae".match "a[bc]" matcher=Text_Matcher.Case_Sensitive . should_equal Nothing
"abABacAC".match "ab" matcher=Text_Matcher.Case_Sensitive mode=Matching_Mode.Last . should_equal "ab"
"abABacAC".match "ab" matcher=Text_Matcher.Case_Insensitive mode=Matching_Mode.Last . should_equal "AB"
"abABacAC".match_all "ab" matcher=Text_Matcher.Case_Sensitive . should_equal ["ab"]
"abABacAC".match_all "ab" matcher=Text_Matcher.Case_Insensitive . should_equal ["ab", "AB"]
"abacadae".match_all "a[bc]" matcher=Text_Matcher.Case_Sensitive . should_equal []
"Strasse and Straße".match_all "STRASSE" matcher=Text_Matcher.Case_Sensitive . should_equal []
"Strasse and Straße".match_all "STRASSE" matcher=Text_Matcher.Case_Insensitive . should_equal ["Strasse", "Straße"]
check_span_all ("abacadae".find_all "a[bc]") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
check_span_all ("abacadae".find_all "a.") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
check_span_all ("abacadae".find_all "a.*") [Span.Value (0.up_to 8) "abacadae"]
check_span_all ("abacadae".find_all "a.+?") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
Test.specify "should default to exact matching for locate but regex for match" <|
txt = "aba[bc]adacae"
@ -1242,131 +1223,57 @@ spec =
txt.locate "a[bc]" . should_equal (Span.Value (2.up_to 7) txt)
txt.locate_all "a[bc]" . should_equal [Span.Value (2.up_to 7) txt]
"ab".match "a[bc]" . should_equal "ab"
"a[bc]".match "a[bc]" . should_equal Nothing
"a[bc]".match_all "a[bc]" . should_equal []
check_span ("ab".find "a[bc]") (Span.Value (0.up_to 2) "ab")
"a[bc]".find "a[bc]" . should_equal Nothing
"a[bc]".find_all "a[bc]" . should_equal []
txt.match "a[bc]" . should_equal "ab"
txt.match_all "a[bc]" . should_equal ["ab", "ac"]
check_span (txt.find "a[bc]") (Span.Value (0.up_to 2) "aba[bc]adacae")
check_span_all (txt.find_all "a[bc]") [Span.Value (0.up_to 2) "aba[bc]adacae", Span.Value (9.up_to 11) "aba[bc]adacae"]
Test.group "Regex matching" <|
Test.specify "should be possible on text" <|
match = "My Text: Goes Here".match "^My Text: (.+)$"
match.should_equal "My Text: Goes Here"
match = "My Text: Goes Here".find "^My Text: (.+)$"
check_span match (Span.Value (0.up_to 18) "My Text: Goes Here")
Test.specify "should be possible on unicode text" <|
txt = "maza건반zaa"
txt.match "^a..z$" . should_equal Nothing
txt.match "^m..a..z.a$" . should_equal txt
txt.match "a..z" . should_equal "a건반z"
Test.specify "should be possible in ascii mode" <|
match = "İ".match "\w" matcher=(Regex_Matcher.Value match_ascii=True)
match.should_equal Nothing
txt.find "^a..z$" . should_equal Nothing
check_span (txt.find "^m..a..z.a$") (Span.Value (0.up_to 9) txt)
check_span (txt.find "a..z") (Span.Value (3.up_to 7) txt)
Test.specify "should be possible in case-insensitive mode" <|
match = "MY".match "my" matcher=(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive)
match.should_equal "MY"
match = "MY".find "my" Case_Sensitivity.Insensitive
check_span match (Span.Value (0.up_to 2) "MY")
Test.specify "should be possible in dot_matches_newline mode" <|
match = 'Foo\n'.match "(....)" matcher=(Regex_Matcher.Value dot_matches_newline=True)
match.should_equal 'Foo\n'
Test.specify "should be possible in multiline mode" <|
text = """
Foo
bar
match = text.match_all "^(...)$" matcher=(Regex_Matcher.Value multiline=True)
match.should_equal ["Foo", "bar"]
Test.specify "should be possible in comments mode" <|
match = "abcde".match "(..) # Match two of any character" matcher=(Regex_Matcher.Value comments=True)
match.should_equal "ab"
Test.group "Text.is_match" <|
Test.group "Text.match" <|
Test.specify "should default to regex" <|
"My Text: Goes Here".is_match "^My Text: (.+)$" . should_be_true
"555-801-1923".is_match "^\d{3}-\d{3}-\d{4}$" . should_be_true
"Hello".is_match "^[a-z]+$" . should_be_false
"Hello".is_match "^[a-z]+$" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive) . should_be_true
"My Text: Goes Here".match "^My Text: (.+)$" . should_be_true
"555-801-1923".match "^\d{3}-\d{3}-\d{4}$" . should_be_true
"Hello".match "^[a-z]+$" . should_be_false
"Hello".match "^[a-z]+$" Case_Sensitivity.Insensitive . should_be_true
Test.specify "should only match whole input" <|
"Hello".is_match "[a-z]" . should_be_false
"x".is_match "[a-z]" . should_be_true
Test.specify "should allow Text_Matcher too" <|
"foobar".is_match "foobar" matcher=Text_Matcher.Case_Sensitive . should_be_true
"foobar".is_match "FOOBAR" matcher=Text_Matcher.Case_Sensitive . should_be_false
"foobar".is_match "foo.*" matcher=Text_Matcher.Case_Sensitive . should_be_false
"foobar".is_match "foo" matcher=Text_Matcher.Case_Sensitive . should_be_false
"foobar".is_match "foobar" matcher=Text_Matcher.Case_Insensitive . should_be_true
"foobar".is_match "FOOBAR" matcher=Text_Matcher.Case_Insensitive . should_be_true
"foobar".is_match "foo.*" matcher=Text_Matcher.Case_Insensitive . should_be_false
"foobar".is_match "foo" matcher=Text_Matcher.Case_Insensitive . should_be_false
"Hello".match "[a-z]" . should_be_false
"x".match "[a-z]" . should_be_true
Test.specify "should be possible on unicode text" <|
"Korean: 건반".is_match "^Korean: (.+)$" . should_be_true
Test.specify "should be possible in ascii mode" <|
"İ".is_match "\w" (Regex_Matcher.Value match_ascii=True) . should_be_false
"Korean: 건반".match "^Korean: (.+)$" . should_be_true
Test.specify "should be possible in case-insensitive mode" <|
"MY".is_match "my" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive) . should_be_true
Test.specify "should be possible in dot_matches_newline mode" <|
'Foo\n'.is_match "(....)" (Regex_Matcher.Value dot_matches_newline=True) . should_be_true
multiline_matches_message = """
This test does not make sense once we require matches to match the
whole string. The `multiline` parameter may not make sense for the
`matches` function. This should be revisited when Text library is
being redesigned.
Test.specify "should be possible in multiline mode" pending=multiline_matches_message <|
text = """
Foo
bar
text.is_match "^(...)$" (Regex_Matcher.Value multiline=True) . should_be_true
Test.specify "should be possible in comments mode" <|
"abcde".is_match "(.....) # Match any five characters" (Regex_Matcher.Value comments=True) . should_be_true
"MY".match "my" Case_Sensitivity.Insensitive . should_be_true
Test.group "Regex finding" <|
Test.specify "should be possible on text" <|
match = "My Text: Goes Here".match "^My Text: (.+)$" mode=Matching_Mode.First
match . should_be_a Text
match . should_equal "My Text: Goes Here"
match = "My Text: Goes Here".find "^My Text: (.+)$"
check_span match (Span.Value (0.up_to 18) "My Text: Goes Here")
Test.specify "should be possible on unicode text" <|
match = "Korean: 건반".match "^Korean: (.+)$" mode=Matching_Mode.First
match . should_be_a Text
match . should_equal "Korean: 건반"
Test.specify "should be possible in ascii mode" <|
match = "İ".match "\w" matcher=(Regex_Matcher.Value match_ascii=True)
match . should_equal Nothing
match = "Korean: 건반".find "^Korean: (.+)$"
check_span match (Span.Value (0.up_to 10) "Korean: 건반")
Test.specify "should be possible in case-insensitive mode" <|
match = "MY".match "my" matcher=(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive)
match . should_be_a Text
match . should_equal "MY"
Test.specify "should be possible in dot_matches_newline mode" <|
match = 'Foo\n'.match "(....)" matcher=(Regex_Matcher.Value dot_matches_newline=True)
match . should_be_a Text
match . should_equal 'Foo\n'
Test.specify "should be possible in multiline mode" <|
text = """
Foo
bar
match = text.match_all "^(...)$" matcher=(Regex_Matcher.Value multiline=True)
match . should_equal ["Foo", "bar"]
Test.specify "should be possible in comments mode" <|
match = "abcde".match "(..) # Match two of any character" matcher=(Regex_Matcher.Value comments=True)
match . should_be_a Text
match . should_equal "ab"
match = "MY".find "my" Case_Sensitivity.Insensitive
check_span match (Span.Value (0.up_to 2) "MY")
Test.group "Regex splitting" <|
Test.specify "should be possible on text" <|