Implement Regular Expression replace and update Text.replace to the new API (#5959)

Re-implement replace on top of Truffle regex.
This commit is contained in:
GregoryTravis 2023-03-28 02:13:12 -04:00 committed by GitHub
parent 9bec3a4e71
commit 6b9cbeacb2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 876 additions and 681 deletions

View File

@ -363,6 +363,8 @@
- [Aligned names of columns created by column operations.][5850] - [Aligned names of columns created by column operations.][5850]
- [Improved `cross_tab`. Renamed `fill_missing` and `is_missing` to - [Improved `cross_tab`. Renamed `fill_missing` and `is_missing` to
`fill_nothing` and `is_nothing`. Added `fill_empty`.][5863] `fill_nothing` and `is_nothing`. Added `fill_empty`.][5863]
- [Removed many regex compile flags from `replace`; added `only_first`
flag.][5959]
[debug-shortcuts]: [debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -550,6 +552,7 @@
[5863]: https://github.com/enso-org/enso/pull/5863 [5863]: https://github.com/enso-org/enso/pull/5863
[5917]: https://github.com/enso-org/enso/pull/5917 [5917]: https://github.com/enso-org/enso/pull/5917
[5705]: https://github.com/enso-org/enso/pull/5705 [5705]: https://github.com/enso-org/enso/pull/5705
[5959]: https://github.com/enso-org/enso/pull/5959
#### Enso Compiler #### Enso Compiler

View File

@ -10,6 +10,7 @@ import project.Data.Range.Range
import project.Data.Text.Case.Case import project.Data.Text.Case.Case
import project.Data.Text.Case_Sensitivity.Case_Sensitivity import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Data.Text.Encoding.Encoding import project.Data.Text.Encoding.Encoding
import project.Data.Text.Helpers
import project.Data.Text.Location.Location import project.Data.Text.Location.Location
import project.Data.Text.Matching_Mode.Matching_Mode import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex.Match.Match import project.Data.Text.Regex.Match.Match
@ -218,6 +219,10 @@ Text.characters self =
- case_sensitivity: Specifies if the text values should be compared case - case_sensitivity: Specifies if the text values should be compared case
sensitively. sensitively.
If an empty regex is used, `find` throws an Illegal_Argument error.
If a non-default locale is used, `find` throws an Illegal_Argument error.
> Example > Example
Find the first substring matching the regex. Find the first substring matching the regex.
@ -227,10 +232,12 @@ Text.characters self =
example_find_insensitive = example_find_insensitive =
## This matches `aBc` @ character 11 ## This matches `aBc` @ character 11
"aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive "aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error | Illegal_Argument
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
Helpers.regex_assume_default_locale case_sensitivity <|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory case_insensitive = case_sensitivity.is_case_insensitive_in_memory
Regex_2.compile pattern case_insensitive=case_insensitive . match self compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern.if_not_error <| compiled_pattern.match self
## Finds all the matches of the regular expression `pattern` in `self`, ## Finds all the matches of the regular expression `pattern` in `self`,
returning a Vector. If not found, will be an empty Vector. returning a Vector. If not found, will be an empty Vector.
@ -240,6 +247,10 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
- case_sensitivity: Specifies if the text values should be compared case - case_sensitivity: Specifies if the text values should be compared case
sensitively. sensitively.
If an empty regex is used, `find_all` throws an Illegal_Argument error.
If a non-default locale is used, `find_all` throws an Illegal_Argument error.
> Example > Example
Find the substring matching the regex. Find the substring matching the regex.
@ -249,10 +260,12 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
example_find_all_insensitive = example_find_all_insensitive =
## This matches `aABbbbc` @ character 0 and `aBC` @ character 11 ## This matches `aABbbbc` @ character 0 and `aBC` @ character 11
"aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive "aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error | Illegal_Argument
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
Helpers.regex_assume_default_locale case_sensitivity <|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory case_insensitive = case_sensitivity.is_case_insensitive_in_memory
Regex_2.compile pattern case_insensitive=case_insensitive . match_all self compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern.if_not_error <| compiled_pattern.match_all self
## ALIAS Check Matches ## ALIAS Check Matches
@ -263,6 +276,10 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
- case_sensitivity: Specifies if the text values should be compared case - case_sensitivity: Specifies if the text values should be compared case
sensitively. sensitively.
If an empty regex is used, `match` throws an Illegal_Argument error.
If a non-default locale is used, `match` throws an Illegal_Argument error.
> Example > Example
Checks if whole text matches a basic email regex. Checks if whole text matches a basic email regex.
@ -274,11 +291,12 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
regex = ".+ct@.+" regex = ".+ct@.+"
# Evaluates to true # Evaluates to true
"CONTACT@enso.org".match regex Case_Sensitivity.Insensitive "CONTACT@enso.org".match regex Case_Sensitivity.Insensitive
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error | Illegal_Argument
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
Helpers.regex_assume_default_locale case_sensitivity <|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern.matches self compiled_pattern.if_not_error <| compiled_pattern.matches self
## ALIAS Split Text ## ALIAS Split Text
@ -327,21 +345,31 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
compiled_pattern.split self mode=Regex_Mode.All compiled_pattern.split self mode=Regex_Mode.All
## ALIAS Replace Text ## ALIAS Replace Text
Replaces the first, last, or all occurrences of term with new_text in the Perform a text or regex replace.
input. If `term` is empty, the function returns the input unchanged.
Returns the text with all matched elements replaced by the provided
replacement. If `input` is empty, the function returns the input unchanged.
In text mode (`use_regex=False`), an empty `term` also returns the input unchanged.
The replacement string can contain references to groups matched by the
regex. The following syntaxes are supported:
$0: the entire match string
$&: the entire match string
$n: the nth group
$<foo>: Named group `foo`
Arguments: Arguments:
- term: The term to find. - term: The string or regex to find.
- new_text: The new text to replace occurrences of `term` with. - replacement: The text to replace matches with.
If `matcher` is a `Regex_Matcher`, `new_text` can include replacement - case_sensitivity: Enables or disables case-insensitive matching. Case
patterns (such as `$<n>`) for a marked group. insensitive matching behaves as if it normalises the case of all input
- mode: Specifies which occurences of term the engine tries to find. When the text before matching on it.
mode is `First` or `Last`, this method replaces the first or last occurence - only_first: If True, only replace the first match.
of term in the input. If set to `All`, it replaces all occurences of term in - use_regex: If true, the term is used as a regular expression.
the input.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity If an empty regex is used, `replace` throws an Illegal_Argument error.
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
regular expression and matched using the associated options. If a non-default locale is used with a regex, `replace` throws an
Illegal_Argument error.
> Example > Example
Replace letters in the text "aaa". Replace letters in the text "aaa".
@ -351,17 +379,17 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
> Example > Example
Replace all occurrences of letters 'l' and 'o' with '#'. Replace all occurrences of letters 'l' and 'o' with '#'.
"Hello World!".replace "[lo]" "#" matcher=Regex_Matcher == "He### W#r#d!" "Hello World!".replace "[lo]" "#" use_regex=True == "He### W#r#d!"
> Example > Example
Replace the first occurrence of letter 'l' with '#'. Replace the first occurrence of letter 'l' with '#'.
"Hello World!".replace "l" "#" mode=Matching_Mode.First == "He#lo World!" "Hello World!".replace "l" "#" only_first=True == "He#lo World!"
> Example > Example
Replace texts in quotes with parentheses. Replace texts in quotes with parentheses.
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' matcher=Regex_Matcher == '(abc) foo (bar) baz' '"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' use_regex=True == '(abc) foo (bar) baz'
! Matching Grapheme Clusters ! Matching Grapheme Clusters
In case-insensitive mode, a single character can match multiple characters, In case-insensitive mode, a single character can match multiple characters,
@ -379,61 +407,39 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
Extended partial matches in case-insensitive mode. Extended partial matches in case-insensitive mode.
# The ß symbol matches the letter `S` twice in case-insensitive mode, because it folds to `ss`. # The ß symbol matches the letter `S` twice in case-insensitive mode, because it folds to `ss`.
'ß'.replace 'S' 'A' matcher=(Text_Matcher Case_Insensitive) . should_equal 'AA' 'ß'.replace 'S' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'AA'
# The 'ffi' ligature is a single grapheme cluster, so even if just a part of it is matched, the whole grapheme is replaced. # The 'ffi' ligature is a single grapheme cluster, so even if just a part of it is matched, the whole grapheme is replaced.
'affib'.replace 'i' 'X' matcher=(Text_Matcher Case_Insensitive) . should_equal 'aXb' 'affib'.replace 'i' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
! Last Match in Regex Mode
Regex always performs the search from the front and matching the last
occurrence means selecting the last of the matches while still generating
matches from the beginning. Regex does not return overlapping matches - it
will return a match at some position and then continue the search after that
match. This will lead to slightly different behavior for overlapping
occurrences of a pattern in Regex mode than in exact text matching mode
where the matches are searched for from the back.
> Example > Example
Comparing Matching in Last Mode in Regex and Text mode Regexp replace.
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher . should_equal "ac" '<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' use_regex=True == 'content is at url'
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher . should_equal "ca"
"aaa aaa".replace "aa" "c" matcher=Text_Matcher . should_equal "ca ca" Text.replace : Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Text ! Illegal_Argument
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Text_Matcher . should_equal "ca aaa" Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False =
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher . should_equal "aaa ac" case use_regex of
"aaa aaa".replace "aa" "c" matcher=Regex_Matcher . should_equal "ca ca" False -> if term.is_empty then self else
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher . should_equal "aaa ca"
Text.replace : Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Text
Text.replace self term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive = if term.is_empty then self else
case matcher of
_ : Text_Matcher ->
array_from_single_result result = case result of array_from_single_result result = case result of
Nothing -> Array.empty Nothing -> Array.empty
_ -> Array.new_1 result _ -> Array.new_1 result
spans_array = case matcher of spans_array = case case_sensitivity of
Text_Matcher.Case_Sensitive -> case mode of Case_Sensitivity.Sensitive -> case only_first of
Regex_Mode.All -> False -> Text_Utils.span_of_all self term
Text_Utils.span_of_all self term True -> array_from_single_result <| Text_Utils.span_of self term
Matching_Mode.First -> Case_Sensitivity.Insensitive locale -> case only_first of
array_from_single_result <| Text_Utils.span_of self term False ->
Matching_Mode.Last ->
array_from_single_result <| Text_Utils.last_span_of self term
_ -> Error.throw (Illegal_Argument.Error "Invalid mode.")
Text_Matcher.Case_Insensitive locale -> case mode of
Regex_Mode.All ->
Text_Utils.span_of_all_case_insensitive self term locale.java_locale Text_Utils.span_of_all_case_insensitive self term locale.java_locale
Matching_Mode.First -> True ->
array_from_single_result <| array_from_single_result <|
Text_Utils.span_of_case_insensitive self term locale.java_locale False Text_Utils.span_of_case_insensitive self term locale.java_locale False
Matching_Mode.Last -> Text_Utils.replace_spans self spans_array replacement
array_from_single_result <| True ->
Text_Utils.span_of_case_insensitive self term locale.java_locale True Helpers.regex_assume_default_locale case_sensitivity <|
_ -> Error.throw (Illegal_Argument.Error "Invalid mode.") case_insensitive = case_sensitivity.is_case_insensitive_in_memory
Text_Utils.replace_spans self spans_array new_text compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
_ : Regex_Matcher -> compiled_pattern.if_not_error <|
compiled_pattern = matcher.compile term compiled_pattern.replace self replacement only_first
compiled_pattern.replace self new_text mode=mode
## ALIAS Get Words ## ALIAS Get Words
@ -1115,9 +1121,9 @@ Text.trim self where=Location.Both what=_.is_whitespace =
term = "straße" term = "straße"
text = "MONUMENTENSTRASSE 42" text = "MONUMENTENSTRASSE 42"
match = text . locate term matcher=(Text_Matcher Case_Insensitive) match = text . locate term case_sensitivity=Case_Sensitivity.Insensitive
term.length == 6 term.length . should_equal 6
match.length == 7 match.length . should_equal 7
! Matching Grapheme Clusters ! Matching Grapheme Clusters
In case-insensitive mode, a single character can match multiple characters, In case-insensitive mode, a single character can match multiple characters,
@ -1265,11 +1271,8 @@ Text.locate_all self term="" case_sensitivity=Case_Sensitivity.Sensitive = if te
- term: The term to find. - term: The term to find.
- start: The index to start searching from. If the index is negative, it - start: The index to start searching from. If the index is negative, it
is counted from the end of the vector. is counted from the end of the vector.
- matcher: Specifies how the term is matched against the input: - case_sensitivity: Specifies if the text values should be compared case
- If a `Text_Matcher`, the text is compared using case-sensitively rules sensitively.
specified in the matcher.
- If a `Regex_Matcher`, the `term` is used as a regular expression and
matched using the associated options.
! What is a Character? ! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode A character is defined as an Extended Grapheme Cluster, see Unicode
@ -1301,11 +1304,8 @@ Text.index_of self term="" start=0 case_sensitivity=Case_Sensitivity.Sensitive =
- term: The term to find. - term: The term to find.
- start: The index to start searching backwards from. If the index is - start: The index to start searching backwards from. If the index is
negative, it is counted from the end of the vector. negative, it is counted from the end of the vector.
- matcher: Specifies how the term is matched against the input: - case_sensitivity: Specifies if the text values should be compared case
- If a `Text_Matcher`, the text is compared using case-sensitively rules sensitively.
specified in the matcher.
- If a `Regex_Matcher`, the `term` is used as a regular expression and
matched using the associated options.
! What is a Character? ! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode A character is defined as an Extended Grapheme Cluster, see Unicode

View File

@ -0,0 +1,16 @@
from Standard.Base import all
import project.Any.Any
import project.Data.Locale.Locale
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Errors.Illegal_Argument.Illegal_Argument
## PRIVATE
   Evaluates `action` unless a case-insensitive match with a non-default
   locale was requested.

   Arguments:
   - case_sensitivity: The requested case sensitivity. `Sensitive`, and
     `Insensitive` with `Locale.default`, are accepted; `Insensitive` with any
     other locale yields an `Illegal_Argument` error.
   - action: The suspended computation to evaluate when the check passes.
regex_assume_default_locale : Case_Sensitivity -> Any -> Any ! Illegal_Argument
regex_assume_default_locale case_sensitivity ~action = case case_sensitivity of
    Case_Sensitivity.Insensitive locale ->
        # Only the default locale is accepted for regex operations.
        if locale == Locale.default then action else
            Error.throw (Illegal_Argument.Error "Custom locales are not supported for regexes.")
    Case_Sensitivity.Sensitive -> action

View File

@ -8,8 +8,8 @@ import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text import project.Data.Text.Text
import project.Data.Vector.Vector import project.Data.Vector.Vector
import project.Errors.Common.Index_Out_Of_Bounds
import project.Error.Error import project.Error.Error
import project.Errors.Common.Index_Out_Of_Bounds
import project.Nothing.Nothing import project.Nothing.Nothing
import project.Panic.Panic import project.Panic.Panic
@ -48,9 +48,9 @@ type Match_2
Arguments: Arguments:
- group: the group name or number. Marked groups defined in the regex are - group: the group name or number. Marked groups defined in the regex are
numbered starting at 1; group 0 refers to the entire match. numbered starting at 1; group 0 refers to the entire match.
utf16_start : Integer | Text -> Integer utf_16_start : Integer | Text -> Integer
utf16_start self group=0 = utf_16_start self group=0 =
span = self.span group span = self.utf_16_span group
if span.is_nothing then Nothing else span.start if span.is_nothing then Nothing else span.start
## Returns the end UTF16 character index, plus one, of a group. ## Returns the end UTF16 character index, plus one, of a group.
@ -58,9 +58,9 @@ type Match_2
Arguments: Arguments:
- group: the group name or number. Marked groups defined in the regex are - group: the group name or number. Marked groups defined in the regex are
numbered starting at 1; group 0 refers to the entire match. numbered starting at 1; group 0 refers to the entire match.
utf16_end : Integer | Text -> Integer utf_16_end : Integer | Text -> Integer
utf16_end self group=0 = utf_16_end self group=0 =
span = self.span group span = self.utf_16_span group
if span.is_nothing then Nothing else span.end if span.is_nothing then Nothing else span.end
## Returns the start grapheme index of a group. ## Returns the start grapheme index of a group.
@ -75,7 +75,7 @@ type Match_2
numbered starting at 1; group 0 refers to the entire match. numbered starting at 1; group 0 refers to the entire match.
start : Integer | Text -> Integer start : Integer | Text -> Integer
start self group=0 = start self group=0 =
span = self.grapheme_span group span = self.span group
if span.is_nothing then Nothing else span.start if span.is_nothing then Nothing else span.start
## Returns the end grapheme index, plus one, of a group. ## Returns the end grapheme index, plus one, of a group.
@ -90,7 +90,7 @@ type Match_2
numbered starting at 1; group 0 refers to the entire match. numbered starting at 1; group 0 refers to the entire match.
end : Integer | Text -> Integer end : Integer | Text -> Integer
end self group=0 = end self group=0 =
span = self.grapheme_span group span = self.span group
if span.is_nothing then Nothing else span.end if span.is_nothing then Nothing else span.end
## Gets the UTF16 span matched by the group with the provided identifier, or ## Gets the UTF16 span matched by the group with the provided identifier, or
@ -120,9 +120,9 @@ type Match_2
In this case, the group id for "(d)", which is 3, is a valid group id and In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3, (Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
Match_2.group will return the default value. Match_2.utf_16_span will return the default value.
span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
span self group=0 ~default=Nothing = utf_16_span self group=0 ~default=Nothing =
group_id = self.pattern.lookup_group group group_id = self.pattern.lookup_group group
start = self.internal_start group_id start = self.internal_start group_id
end = self.internal_end group_id end = self.internal_end group_id
@ -158,10 +158,10 @@ type Match_2
In this case, the group id for "(d)", which is 3, is a valid group id and In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get (Pattern_2.lookup_group 3) will return 3. If the caller tries to get
group 3, Match_2.group will return the default value. group 3, Match_2.span will return the default value.
grapheme_span : Integer | Text -> Any -> Span ! No_Such_Group span : Integer | Text -> Any -> Span ! No_Such_Group
grapheme_span self group=0 ~default=Nothing = span self group=0 ~default=Nothing =
result = self.span group Nothing result = self.utf_16_span group Nothing
if result.is_nothing then default else result.to_grapheme_span if result.is_nothing then default else result.to_grapheme_span
## Gets the Text matched by the group with the provided identifier, or ## Gets the Text matched by the group with the provided identifier, or
@ -186,10 +186,10 @@ type Match_2
In this case, the group id for "(d)", which is 3, is a valid group id and In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get (Pattern_2.lookup_group 3) will return 3. If the caller tries to get
group 3, Match_2.group will return the default value. group 3, Match_2.text will return the default value.
text : Integer | Text -> Any -> Text ! No_Such_Group text : Integer | Text -> Any -> Text ! No_Such_Group
text self group=0 ~default=Nothing = text self group=0 ~default=Nothing =
result = self.grapheme_span group Nothing result = self.span group Nothing
if result.is_nothing then default else result.text if result.is_nothing then default else result.text
## Gets a vector containing the Text of _all_ of the capturing groups in ## Gets a vector containing the Text of _all_ of the capturing groups in
@ -208,6 +208,16 @@ type Match_2
If the regex contained named groups, these may also be accessed by If the regex contained named groups, these may also be accessed by
index based on their position in the pattern. index based on their position in the pattern.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"abc".find "ab((c)|(d))"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. `groups` will return the
default value for groups that do not participate.
> Example > Example
Get a vector of the text matched by all of the groups in this match, Get a vector of the text matched by all of the groups in this match,
replacing the value for groups that didn't match with "UNMATCHED". replacing the value for groups that didn't match with "UNMATCHED".
@ -237,8 +247,8 @@ type Match_2
"abc".find "ab((c)|(d))" "abc".find "ab((c)|(d))"
In this case, the group id for "(d)", which is 3, is a valid group id and In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3, (Pattern_2.lookup_group 3) will return 3. `named_groups` will map
Match_2.group will return the default value. a named group that does not participate to the default value.
> Example > Example
Get the map of all of the named groups in this match, replacing the Get the map of all of the named groups in this match, replacing the
@ -261,7 +271,7 @@ type Match_2
Arguments: Arguments:
- id: The integer index or name of that group. - id: The integer index or name of that group.
- if_missing: The value to return if the index is out of bounds. - if_missing: The value to return if the index is out of bounds.
get : Integer -> Any -> Any get : Integer -> Any -> Text | Any
get self index ~if_missing=Nothing = get self index ~if_missing=Nothing =
self.text index . catch No_Such_Group (_-> if_missing) self.text index . catch No_Such_Group (_-> if_missing)
@ -272,6 +282,6 @@ type Match_2
Arguments: Arguments:
- id: The integer index or name of that group. - id: The integer index or name of that group.
- if_missing: The value to return if the index is out of bounds. - if_missing: The value to return if the index is out of bounds.
at : Integer -> Any ! Index_Out_Of_Bounds at : Integer -> Text ! Index_Out_Of_Bounds
at self index = at self index =
self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count)) self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count))

View File

@ -6,17 +6,19 @@ import project.Data.Range.Range
import project.Data.Text.Span.Span import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Regex.Match_2.Match_2 import project.Data.Text.Regex.Match_2.Match_2
import project.Data.Text.Regex.Replacer.Replacer
import project.Data.Text.Regex_2.No_Such_Group import project.Data.Text.Regex_2.No_Such_Group
import project.Data.Text.Text import project.Data.Text.Text
import project.Data.Vector.Vector import project.Data.Vector.Vector
import project.Error.Error import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Meta import project.Meta
import project.Nothing.Nothing import project.Nothing.Nothing
import project.Panic.Panic
import project.Polyglot.Polyglot import project.Polyglot.Polyglot
from project.Data.Boolean import Boolean, True, False from project.Data.Boolean import Boolean, True, False
polyglot java import org.enso.base.Replacer_Cache
polyglot java import org.enso.base.Text_Utils polyglot java import org.enso.base.Text_Utils
type Pattern_2 type Pattern_2
@ -50,13 +52,15 @@ type Pattern_2
## Tries to match the provided `input` against the pattern `self`. ## Tries to match the provided `input` against the pattern `self`.
Returns a `Vector Match_2` objects, each containing the matched text Returns a `Vector Match_2` object, each containing the matched text
and its match groups. and its match groups.
Arguments: Arguments:
- input: The text to match the pattern described by `self` against. - input: The text to match the pattern described by `self` against.
match_all : Text -> Vector Match_2 match_all : Text -> Vector Match_2 ! Illegal_Argument
match_all self input = match_all self input =
pattern_is_empty = self.internal_regex_object.pattern == ''
if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
builder = Vector.new_builder builder = Vector.new_builder
it = Match_Iterator.new self input it = Match_Iterator.new self input
go it = case it.next of go it = case it.next of
@ -89,6 +93,82 @@ type Pattern_2
find_all self input = find_all self input =
self.match_all input . map match_to_group_maybe self.match_all input . map match_to_group_maybe
## ADVANCED
Replace all occurrences of the pattern described by `self` in the `input`
with the specified `replacement`.
Arguments:
- input: The text in which to perform the replacement(s).
- replacement: The text with which to replace any matches. It may contain the group references described below.
- only_first: If True, only replace the first match.
If this method performs no replacements it will return the `input` text
unchanged.
The replacement string can contain references to groups matched by the
regex. The following syntaxes are supported:
$0: the entire match string
$&: the entire match string
$n: the nth group
$<foo>: Named group `foo`
> Example
Replace letters in the text "aa".
pattern = Regex_2.compile 'aa'
pattern.replace 'aaa' 'b' == 'ba'
> Example
Replace all occurrences of letters 'l' and 'o' with '#'.
pattern = Regex_2.compile '[lo]'
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
> Example
Replace the first occurrence of letter 'l' with '#'.
pattern = Regex_2.compile 'l'
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
> Example
Replace texts in quotes with parentheses.
pattern = Regex_2.compile '"(.*?)"'
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
> Example
Replace a literal string with a replacement value.
pattern = Regex_2.compile "aa"
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "xyz"
match == "xyz ab xyz ac ad xyz xyz ax"
> Example
Replace each word with the same word surrounded by `[]`.
pattern = Regex_2.compile "([a-z]+)"
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
replace : Text -> Text -> Boolean -> Text
replace self input replacement only_first=False =
    it = Match_Iterator.new self input
    # `Match_Iterator.new` always returns a `Match_Iterator.Value`, so there is
    # nothing to dispatch on here; the first `Match_Iterator_Value` is only
    # produced by `it.next` below.
    replacer = Replacer.new replacement self
    # If building the replacer produced an error, return that error instead of
    # running the replacement loop.
    replacer.if_not_error <|
        # Walks the iterator, accumulating the output text in `acc`.
        go step acc = case step of
            Match_Iterator_Value.Next filler match next_it ->
                # Append the unmatched text before this match, followed by
                # the expansion of the replacement for this match.
                extended = acc + filler.text + (replacer.replace match)
                # With `only_first`, `early_exit` is used instead of `next`,
                # so the rest of the input arrives as one final filler.
                following = if only_first then next_it.early_exit else next_it.next
                @Tail_Call go following extended
            Match_Iterator_Value.Last filler ->
                # No more matches: append the trailing unmatched text.
                acc + filler.text
        go it.next ""
## PRIVATE ## PRIVATE
Look up a match group name or number, and check that it is valid. Look up a match group name or number, and check that it is valid.
@ -106,6 +186,9 @@ type Pattern_2
A group name is an alias for a group number; if a name is passed to A group name is an alias for a group number; if a name is passed to
this method, it returns the corresponding group number. this method, it returns the corresponding group number.
If a group number is passed to `lookup_group` and it is valid, it will
simply return the group number.
Note that it is possible for a group to "not participate in the match", Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails. does not participate -- it neither matches nor fails.
@ -138,6 +221,15 @@ type Pattern_2
_ : Integer -> n _ : Integer -> n
Nothing -> Error.throw (No_Such_Group.Error name) Nothing -> Error.throw (No_Such_Group.Error name)
## PRIVATE

   Return a lazy iterator over the matches of this pattern against a string.

   Arguments:
   - input: the string to match against.

   The returned iterator starts at offset 0 of `input`; the regex is only
   executed when the iterator's `next` method is called.
iterator : Text -> Match_Iterator
iterator self input = Match_Iterator.new self input
## Return the number of groups in the underlying RegexObject. ## Return the number of groups in the underlying RegexObject.
Note, the count includes group 0 (the whole match) as well. Note, the count includes group 0 (the whole match) as well.
group_count : Integer group_count : Integer
@ -154,32 +246,51 @@ type Pattern_2
Performs the regex match, and iterates through the results. Yields both Performs the regex match, and iterates through the results. Yields both
the matched parts of the string, and the 'filler' parts between them. the matched parts of the string, and the 'filler' parts between them.
The 'filler' elements are `Utf_16_Span`s, not `Span`s. This is because
match and replacement boundaries can fall in the middle of multi-
character graphemes, thereby splitting them apart.
At each step, it yields a Match_Iterator_Value, which has either a filler At each step, it yields a Match_Iterator_Value, which has either a filler
and a match, or just the final filler. A Match_Iterator_Value.Last value is and a match, or just the final filler. A Match_Iterator_Value.Last value is
returned at the end, and only at the end. returned at the end, and only at the end.
Optionally, you can call `early_exit` to have it return the remainder of
the string, unmatched, as a single Last value. (Used for `replace` with
`only_first=True`.)
type Match_Iterator type Match_Iterator
new : Pattern_2 -> Text -> Match_Iterator new : Pattern_2 -> Text -> Match_Iterator
new pattern input = Match_Iterator.Value pattern input 0 new pattern input = Match_Iterator.Value pattern input 0
Value (pattern : Pattern_2) (input : Text) (cursor : Integer) Value (pattern : Pattern_2) (input : Text) (cursor : Integer)
## Return the next match, or the last filler string if there is no
additional match.
Also returns the next iterator, if there was a match.
next : Match_Iterator_Value next : Match_Iterator_Value
next self = next self =
regex_result = self.pattern.internal_regex_object.exec self.input self.cursor regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
case regex_result.isMatch of case regex_result.isMatch of
False -> False ->
filler_range = Range.new self.cursor (Text_Utils.char_length self.input) filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span filler_span = (Utf_16_Span.Value filler_range self.input)
Match_Iterator_Value.Last filler_span Match_Iterator_Value.Last filler_span
True -> True ->
match_start = regex_result.getStart 0 match_start = regex_result.getStart 0
filler_range = Range.new self.cursor match_start filler_range = Range.new self.cursor match_start
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span filler_span = (Utf_16_Span.Value filler_range self.input)
match = Match_2.Value self.pattern regex_result self.input match = Match_2.Value self.pattern regex_result self.input
next_cursor = match.utf16_end 0 next_cursor = match.utf_16_end 0
next_iterator = Match_Iterator.Value self.pattern self.input next_cursor next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
Match_Iterator_Value.Next filler_span match next_iterator Match_Iterator_Value.Next filler_span match next_iterator
## Stops iterating: returns everything from the current cursor to the end
   of the input as a single, final `Last` value, without running the regex
   again.
early_exit : Match_Iterator_Value
early_exit self =
    input_end = Text_Utils.char_length self.input
    remainder = Utf_16_Span.Value (Range.new self.cursor input_end) self.input
    Match_Iterator_Value.Last remainder
to_text_debug : Vector Text to_text_debug : Vector Text
to_text_debug self = to_text_debug self =
vb = Vector.new_builder vb = Vector.new_builder

View File

@ -0,0 +1,144 @@
import project.Data.Numbers.Integer
import project.Data.Text.Extensions
import project.Data.Text.Regex.Match_2.Match_2
import project.Data.Text.Regex.Pattern_2.Match_Iterator_Value
import project.Data.Text.Regex.Pattern_2.Pattern_2
import project.Data.Text.Regex_2
import project.Data.Text.Regex_2.No_Such_Group
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Error.Error
import project.Errors.Illegal_State.Illegal_State
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
polyglot java import java.lang.StringBuilder
polyglot java import org.enso.base.Replacer_Cache
type Replacer
    ## PRIVATE

       Turns regex matches into their replacement text.

       `Pattern_2.replace` creates a Replacer and asks it for the text that
       should stand in for each match. The replacement template may refer
       to capture groups of the original regex.

       The template is stored pre-parsed, as a vector of `Replacement`s:
       each element is either a literal piece of text or the number of a
       capture group. Producing the text for a match means walking that
       vector, swapping every group number for the text of that group in
       the match, and concatenating the pieces.
    Value (replacement : Vector Replacement)

    ## Creates a new Replacer.

       Arguments:
       - replacement_string: the replacement template; it may contain group
         references.
       - pattern: the pattern whose matches will be replaced. It is passed
         on to the template parser, which resolves group references
         against it.
    new : Text -> Pattern_2 -> Replacer ! No_Such_Group
    new replacement_string pattern =
        parsed_template = build_replacement_vector_cached replacement_string pattern
        Replacer.Value parsed_template

    ## Build a replacement string from a match.

       Arguments:
       - match: the match from the original string that is to be replaced.
    replace : Match_2 -> Text
    replace self match =
        # Text contributed by a single template element.
        # NOTE(review): what `match.text` yields for a group that did not
        # participate in the match is not visible here -- confirm it is a
        # value that `StringBuilder.append` handles as intended.
        piece_for element = case element of
            Replacement.Literal text -> text
            Replacement.Substitution group_number -> match.text group_number
        output = StringBuilder.new
        self.replacement.each element->
            output.append (piece_for element)
        output.toString
## PRIVATE

   Get the size of the Replacer LRU cache. For testing.

   Returns whatever `Replacer_Cache.getLruSize` reports.
get_lru_size : Integer
get_lru_size = Replacer_Cache.getLruSize
## PRIVATE

   Look up a replacement string in the Replacer LRU cache. For testing.

   Arguments:
   - replacement_string: the replacement string that was used as the cache
     key.

   Returns the cached entry for that key, which is the parsed replacement
   vector stored by `build_replacement_vector_cached` (not a `Replacer`),
   or `Nothing` if the key is not in the cache.
replacer_cache_lookup : Text -> Vector Replacement | Nothing
replacer_cache_lookup replacement_string = Replacer_Cache.get replacement_string
## PRIVATE

   Regex that recognises one group reference inside a replacement string:
   a `$` followed by one of
   - one or more decimal digits (capture group 2),
   - another `$` (capture group 3),
   - `&` (capture group 4),
   - `<name>` (capture group 5; the name without the angle brackets is
     capture group 6).
   Capture group 1 is everything after the leading `$`.
group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))"
## PRIVATE

   Build a replacement vector, reusing a recently built one when possible.

   The replacement string is parsed into an alternating series of literal
   strings and group reference numbers by `build_replacement_vector`.
   `Replacer_Cache` is consulted first, so the parser only runs when the
   replacement string is not among the cached keys.

   NOTE(review): the cache is keyed by `replacement_string` alone, while the
   parsed vector also depends on `pattern` (group names are resolved, and
   group numbers validated, against it). Confirm that a vector cached for
   one pattern cannot be served for a different pattern.
build_replacement_vector_cached : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
build_replacement_vector_cached replacement_string pattern =
    parse_on_miss = _-> build_replacement_vector replacement_string pattern
    Replacer_Cache.get_or_set replacement_string parse_on_miss
## PRIVATE

   Build a replacement vector.

   Scans the replacement string for group references and produces an
   alternating series: the literal text before each reference (possibly
   empty), then the parsed reference, and finally the literal text after
   the last reference. If any reference fails to parse, that error is
   returned instead of a vector.
build_replacement_vector : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
build_replacement_vector replacement_string pattern =
    reference_finder = Regex_2.compile group_reference_regex
    pieces = Vector.new_builder
    # Walk the references one by one, appending to `pieces` as we go.
    collect iterator = case iterator.next of
        Match_Iterator_Value.Last tail ->
            pieces.append (Replacement.Literal tail.text)
        Match_Iterator_Value.Next literal_part reference rest ->
            parsed = parse_group_number pattern reference
            # Stop at the first invalid reference and surface its error.
            parsed.if_not_error <|
                pieces.append (Replacement.Literal literal_part.text)
                pieces.append parsed
                @Tail_Call collect rest
    outcome = collect (reference_finder.iterator replacement_string)
    outcome.if_not_error <|
        pieces.to_vector
## PRIVATE

   Parse a capture group reference.

   Arguments:
   - pattern: the Pattern_2 used to initiate the replacement. It is used to
     look up and validate the referenced capture group.
   - match: the match of the replacement string against
     `group_reference_regex`.

   Returns a Replacement: a group number, or, in the case of `$$`, a literal
   `$`. `$&` refers to group 0, `$<name>` to a named group, and `$` followed
   by digits to a numbered group.

   See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
parse_group_number : Pattern_2 -> Match_2 -> Replacement ! No_Such_Group
parse_group_number pattern match =
    # Every reference starts with `$`; the character after it selects the form.
    marker = match.text.take 2
    case marker of
        "$&" -> Replacement.Substitution 0
        "$$" -> Replacement.Literal "$"
        "$<" ->
            # Group 6 of the reference regex is the name without the `<>`.
            resolved = pattern.lookup_group (match.text 6)
            Replacement.Substitution resolved
        _ ->
            # Group 2 of the reference regex is the run of digits.
            digits = match.text 2
            Replacement.Substitution (pattern.lookup_group (Integer.parse digits))
# One element of a parsed replacement string, as consumed by
# `Replacer.replace`.
type Replacement
## A piece of literal text that is copied into the output unchanged.
Literal (text : Text)
## The number of the capture group whose text from the match is inserted.
Substitution (group_number : Integer)

View File

@ -7,6 +7,7 @@ import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument import project.Errors.Illegal_Argument.Illegal_Argument
import project.Nothing.Nothing import project.Nothing.Nothing
import project.Panic.Panic import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False from project.Data.Boolean import Boolean, True, False
from project.Errors.Common import Syntax_Error from project.Errors.Common import Syntax_Error
@ -17,18 +18,21 @@ polyglot java import java.util.regex.Pattern as Java_Pattern
Arguments Arguments
- expression: The text representing the regular expression that you want to - expression: The text representing the regular expression that you want to
compile. compile. Must be non-empty.
- case_insensitive: Enables or disables case-insensitive matching. Case - case_insensitive: Enables or disables case-insensitive matching. Case
insensitive matching behaves as if it normalises the case of all input insensitive matching behaves as if it normalises the case of all input
text before matching on it. text before matching on it.
If an empty regex is used, `compile` throws an Illegal_Argument error.
? Why Compile? ? Why Compile?
While many regex engines are able to cache ad-hoc patterns, it is often While many regex engines are able to cache ad-hoc patterns, it is often
useful to be able to manually retain a pattern that you have computed. This useful to be able to manually retain a pattern that you have computed. This
function exists so you can hold onto the resultant `Pattern_2` object, function exists so you can hold onto the resultant `Pattern_2` object,
instead of immediately proceeding to match using it. instead of immediately proceeding to match using it.
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error | Illegal_Argument
compile self expression case_insensitive=Nothing = compile self expression case_insensitive=Nothing =
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
options_string = if case_insensitive == True then "usgi" else "usg" options_string = if case_insensitive == True then "usgi" else "usg"
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic-> internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->

View File

@ -843,8 +843,8 @@ type Table
parse_problem_builder.attach_problems_before on_problems <| parse_problem_builder.attach_problems_before on_problems <|
Table.new new_columns Table.new new_columns
## Replaces the first, last, or all occurrences of `term` with ## Replaces the first, or all occurrences of `term` with `new_text` in each
`new_text` in each text row of selected columns. text row of selected columns.
If `term` is empty, the function returns the table unchanged. If `term` is empty, the function returns the table unchanged.
This method follows the exact replacement semantics of the This method follows the exact replacement semantics of the
@ -854,15 +854,13 @@ type Table
- columns: Column selection criteria or a column name or index. - columns: Column selection criteria or a column name or index.
- term: The term to find. - term: The term to find.
- new_text: The new text to replace occurrences of `term` with. - new_text: The new text to replace occurrences of `term` with.
If `matcher` is a `Regex_Matcher`, `new_text` can include replacement If use_regex is true, `new_text` can include replacement patterns
patterns (such as `$<n>`) for a marked group. (such as `$<n>`) for a marked group.
- mode: Specifies which occurences of term the engine tries to find. When the - case_insensitive: Enables or disables case-insensitive matching. Case
mode is `First` or `Last`, this method replaces the first or last occurence insensitive matching behaves as if it normalises the case of all input
of term in each individual table cell. If set to `All`, it replaces all text before matching on it.
occurences of term. - only_first: If True, only replace the first match.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity - use_regex: If true, the term is used as a regular expression.
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
regular expression and matched using the associated options.
- on_problems: Specifies how to handle if a problem occurs, raising as a - on_problems: Specifies how to handle if a problem occurs, raising as a
warning by default. warning by default.
@ -881,21 +879,21 @@ type Table
> Example > Example
Remove leading and trailing spaces from cells in multiple columns. Remove leading and trailing spaces from cells in multiple columns.
table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" matcher=Regex_Matcher.Value table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" use_regex=True
> Example > Example
Replace texts in quotes with parentheses in column at index 1. Replace texts in quotes with parentheses in column at index 1.
table.replace_text 1 '"(.*?)"' '($1)' matcher=Regex_Matcher.Value table.replace_text 1 '"(.*?)"' '($1)' use_regex=True
replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Problem_Behavior -> Table replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Problem_Behavior -> Table
replace_text self columns=[0] term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else replace_text self columns=[0] term="" new_text="" case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else
problem_builder = Problem_Builder.new problem_builder = Problem_Builder.new
selection = self.columns_helper.select_columns_helper columns reorder=False problem_builder selection = self.columns_helper.select_columns_helper columns reorder=False problem_builder
selected_names = Map.from_vector (selection.map column-> [column.name, True]) selected_names = Map.from_vector (selection.map column-> [column.name, True])
map_preserve_name column f = column.map f . rename column.name map_preserve_name column f = column.map f . rename column.name
do_replace = _.replace term new_text mode matcher do_replace = _.replace term new_text case_sensitivity=case_sensitivity only_first=only_first use_regex=use_regex
do_replace_only_text = case _ of do_replace_only_text = case _ of
item : Text -> do_replace item item : Text -> do_replace item
item -> item item -> item

View File

@ -18,7 +18,7 @@ type Naming_Helpers
sanitize_name : Text -> Text sanitize_name : Text -> Text
sanitize_name name = sanitize_name name =
# Using the regex matcher due to the #5831 bug. # Using the regex matcher due to the #5831 bug.
name.replace '\0' '\\\\0' matcher=Regex_Matcher.Value name.replace '\0' '\\0' use_regex=True
## PRIVATE ## PRIVATE
Generates a column name for a binary operation. Generates a column name for a binary operation.

View File

@ -0,0 +1,51 @@
package org.enso.base;
import org.graalvm.collections.Pair;
import org.graalvm.polyglot.Value;
import java.util.ArrayList;
import java.util.function.Function;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * A tiny fixed-capacity cache from replacement strings to the values built for them.
 *
 * <p>Entries live in a circular buffer of {@code lruSize} slots. A new entry overwrites the slot
 * at {@code nextSlot}, i.e. the one written longest ago; a cache hit does not change the order.
 * No synchronization is performed here.
 */
public class Replacer_Cache {
  private static final int lruSize = 5;

  // Circular buffer holding the most recently inserted (key, value) pairs; empty slots are null.
  private static final List<Pair<String, Value>> lru = new ArrayList<>(lruSize);

  static {
    // Pre-fill every slot so that `set` can be used on any index from the start.
    int remaining = lruSize;
    while (remaining > 0) {
      lru.add(null);
      remaining--;
    }
  }

  // Slot that the next inserted entry will occupy.
  private static int nextSlot = 0;

  /**
   * Returns the cached value for {@code key}; if there is none, calls {@code value_producer}
   * (with a null argument), stores its result under {@code key} and returns it.
   */
  public static Value get_or_set(String key, Function<Void, Value> value_producer) {
    Value cached = get(key);
    if (cached != null) {
      return cached;
    }
    Value produced = value_producer.apply(null);
    lru.set(nextSlot, Pair.create(key, produced));
    nextSlot = (nextSlot + 1) % lruSize;
    return produced;
  }

  /** Returns the value stored under {@code key}, or null if absent. Visible for testing. */
  public static Value get(String key) {
    for (Pair<String, Value> entry : lru) {
      if (entry != null && entry.getLeft().equals(key)) {
        return entry.getRight();
      }
    }
    return null;
  }

  /** Returns the number of slots in the cache. */
  public static int getLruSize() {
    return lruSize;
  }
}

View File

@ -644,7 +644,7 @@ spec =
bools = ["bools", [False, False, True, True]] bools = ["bools", [False, False, True, True]]
texts = ["texts", ["foo", "bar", "baz", "spam"]] texts = ["texts", ["foo", "bar", "baz", "spam"]]
table = Table.new [bools, texts] table = Table.new [bools, texts]
actual = table.replace_text "texts" "(a|o)" "$1e" matcher=Regex_Matcher.Value actual = table.replace_text "texts" "(a|o)" "$1e" use_regex=True
actual.at "texts" . to_vector . should_equal ["foeoe", "baer", "baez", "spaem"] actual.at "texts" . to_vector . should_equal ["foeoe", "baer", "baez", "spaem"]
Problems.assume_no_problems actual Problems.assume_no_problems actual

View File

@ -1,199 +1,98 @@
from Standard.Base import all from Standard.Base import all
import Standard.Base.Data.Text.Span.Span
import Standard.Base.Data.Text.Span.Utf_16_Span import Standard.Base.Data.Text.Span.Utf_16_Span
import Standard.Base.Data.Text.Regex_2
import Standard.Base.Data.Text.Regex.Match_2.Match_2 import Standard.Base.Data.Text.Regex.Match_2.Match_2
import Standard.Base.Data.Text.Regex.Pattern_2.Pattern_2
import Standard.Base.Data.Text.Regex.Replacer.Replacer
import Standard.Base.Data.Text.Regex_2
import Standard.Base.Data.Text.Regex_2.No_Such_Group import Standard.Base.Data.Text.Regex_2.No_Such_Group
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
from Standard.Test import Test, Test_Suite from Standard.Test import Test, Test_Suite
import Standard.Test.Extensions import Standard.Test.Extensions
# default_mask = Java_Pattern.CANON_EQ.bit_or Java_Pattern.UNICODE_CASE . bit_or Java_Pattern.UNICODE_CHARACTER_CLASS polyglot java import org.enso.base.Replacer_Cache
spec = spec =
## Test.group "Compile" <|
Test.group "The default regex engine's options handling" <| Test.specify "should be able to be compiled" <|
pattern = Regex_2.compile "(?<dots>..)" case_insensitive=True
pattern . should_be_a Pattern_2
Test.specify "should convert options to Java" <| Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
options = [Regex_Option.Comments, Regex_Option.Multiline, Default_Engine.Option.Unix_Lines] Regex_2.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error
expected_mask = Java_Pattern.UNIX_LINES.bit_or Java_Pattern.COMMENTS . bit_or Java_Pattern.MULTILINE . bit_or default_mask
actual_mask = Default_Engine.from_enso_options options
actual_mask . should_equal expected_mask Test.specify "should disallow empty patterns in `compile`" <|
Regex_2.compile "" . should_fail_with Illegal_Argument
Test.specify "should specify the unicode options by default" <|
actual_mask = Default_Engine.from_enso_options []
actual_mask . should_equal default_mask
Test.specify "should handle ascii matching by disabling unicode" <|
actual_mask = Default_Engine.from_enso_options [Regex_Option.Ascii_Matching]
actual_mask . should_equal 0
Test.specify "should result in an error when an option is invalid" <|
Default_Engine.from_enso_options [""] . should_fail_with Invalid_Option
Default_Engine.from_enso_options ["", Regex_Option.Ascii_Matching] . should_fail_with Invalid_Option
Test.group "The default regex engine (Default_Engine)" <|
Test.specify "should be able to compile patterns with no options" <|
engine = Default_Engine.new
pattern = engine.compile "^a$" []
pattern.engine . should_equal engine
pattern.options . should_equal []
pattern.internal_pattern.flags . should_equal default_mask
Test.specify "should be able to compile patterns with global options" <|
engine = Default_Engine.new
pattern = engine.compile "^a$" [Regex_Option.Multiline]
pattern.engine . should_equal engine
pattern.options . should_equal [Regex_Option.Multiline]
pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.MULTILINE)
Test.specify "should be able to compile patterns with engine-specific options" <|
engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
pattern = engine.compile "^a$" []
pattern.engine . should_equal engine
pattern.options . should_equal [Default_Engine.Option.Literal_Pattern]
pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.LITERAL)
Test.specify "should be able to compile patterns with combined options" <|
engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
pattern = engine.compile "^a$" [Regex_Option.Comments]
pattern.engine . should_equal engine
pattern.options.contains Default_Engine.Option.Literal_Pattern . should_be_true
pattern.options.contains Regex_Option.Comments . should_be_true
pattern.internal_pattern.flags . should_equal (default_mask . bit_or Java_Pattern.LITERAL . bit_or Java_Pattern.COMMENTS)
Test.specify "should return a syntax error of the regex syntax is invalid" <|
engine = Default_Engine.new
engine.compile "^(a" [] . should_fail_with Syntax_Error
Test.specify "should throw an invalid options error if an option is invalid" <|
engine = Default_Engine.new
engine.compile "^a$" ["invalid"] . should_fail_with Invalid_Option
Test.group "Escape" <|
Test.specify "should escape an expression for use as a literal" <| Test.specify "should escape an expression for use as a literal" <|
pattern = "http://example.com" pattern = "http://example.com"
engine = Default_Engine.new Regex_2.escape pattern . should_equal "\Qhttp://example.com\E"
engine.escape pattern . should_equal "\Qhttp://example.com\E"
Test.group "The default regex engine's Pattern.matches" <|
engine = Default_Engine.new
Test.group "Pattern.matches" <|
Test.specify "should return True when the pattern matches against the input" <| Test.specify "should return True when the pattern matches against the input" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" [] pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
pattern.matches input . should_be_true pattern.matches input . should_be_true
Test.specify "should return False when the pattern doesn't match against the input" <| Test.specify "should return False when the pattern doesn't match against the input" <|
pattern = engine.compile "aaz" [] pattern = Regex_2.compile "aaz"
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
pattern.matches input . should_be_false pattern.matches input . should_be_false
Test.specify "should check for full matches" <| Test.specify "should check for full matches" <|
pattern = engine.compile "f.o" [] pattern = Regex_2.compile "f.o"
pattern.matches "foo" . should_be_true pattern.matches "foo" . should_be_true
pattern.matches "foobar" . should_be_false pattern.matches "foobar" . should_be_false
Test.group "The default regex engine's Pattern.match" <| Test.specify "`matches` with an empty pattern should be an error" <|
engine = Default_Engine.new pattern = Regex_2.compile ""
pattern.matches "ABC" . should_fail_with Illegal_Argument
Test.group "Pattern.match" <|
Test.specify "should be able to `match` the first instance of the pattern in the input" <| Test.specify "should be able to `match` the first instance of the pattern in the input" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" [] pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First match = pattern.match input
match . should_be_a Default_Engine.Match.Value match . should_be_a Match_2
match.text 0 . should_equal input match.text 0 . should_equal input
Test.specify "should return `Nothing` if there are no matches in first mode" <| Test.specify "should return `Nothing` if there are no matches in first mode" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" [] pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "abc" input = "abc"
match = pattern.match input mode=Matching_Mode.First match = pattern.match input
match . should_equal Nothing
Test.specify "should be able to `match` at most N instances of the pattern in the input" <|
pattern = engine.compile "(..)" []
input = "abcdefghij"
match = pattern.match input mode=3
match.length . should_equal 3
match.at 0 . group 0 . should_equal "ab"
match.at 1 . group 0 . should_equal "cd"
match.at 2 . group 0 . should_equal "ef"
Test.specify "should `match` fewer than N instances when there are fewer than N in the input" <|
pattern = engine.compile "(..)" []
input = "abcdef"
match = pattern.match input mode=5
match.length . should_equal 3
match.at 0 . group 0 . should_equal "ab"
match.at 1 . group 0 . should_equal "cd"
match.at 2 . group 0 . should_equal "ef"
Test.specify "should return `Nothing` when a counted match fails" <|
pattern = engine.compile "(aa)" []
input = "abcdefghij"
match = pattern.match input mode=3
match . should_equal Nothing match . should_equal Nothing
Test.specify "should be able to `match` the all instances of the pattern in the input" <| Test.specify "should be able to `match` the all instances of the pattern in the input" <|
pattern = engine.compile "(..)" [] pattern = Regex_2.compile "(..)"
input = "abcdefghij" input = "abcdefghij"
match = pattern.match input mode=Regex_Mode.All matches = pattern.match_all input
match.length . should_equal 5 matches.length . should_equal 5
match.at 0 . group 0 . should_equal "ab" matches.at 0 . text 0 . should_equal "ab"
match.at 1 . group 0 . should_equal "cd" matches.at 1 . text 0 . should_equal "cd"
match.at 2 . group 0 . should_equal "ef" matches.at 2 . text 0 . should_equal "ef"
match.at 3 . group 0 . should_equal "gh" matches.at 3 . text 0 . should_equal "gh"
match.at 4 . group 0 . should_equal "ij" matches.at 4 . text 0 . should_equal "ij"
Test.specify "should return `Nothing` when an all match match fails" <| Test.specify "should return `[]` when an all match match fails" <|
pattern = engine.compile "(aa)" [] pattern = Regex_2.compile "(aa)"
input = "abcdefghij" input = "abcdefghij"
match = pattern.match input mode=Regex_Mode.All match = pattern.match_all input
match . should_equal Nothing match . should_equal []
Test.specify "should be able to `match` the pattern against the entire input" <| Test.specify "`match` with an empty pattern should be an error" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" [] pattern = Regex_2.compile ""
input = "aa ab abc a bc bcd" pattern.match "ABC" . should_fail_with Illegal_Argument
match = pattern.match input mode=Regex_Mode.Full
match . should_be_a Default_Engine.Match.Value
match.text 0 . should_equal input
Test.specify "should return `Nothing` if a full match does not match the entire input" <| Test.specify "`match_all` with an empty pattern should be an error" <|
pattern = engine.compile "(..)" [] pattern = Regex_2.compile ""
input = "aa ab" pattern.match_all "ABC" . should_fail_with Illegal_Argument
full_match = pattern.match input mode=Regex_Mode.Full
full_match . should_equal Nothing
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
Test.specify "should be able to `match` the pattern against bounded input" <| Test.group "Pattern_2.find and .find_all" <|
pattern = engine.compile "(..)" []
input = "abcdefghij"
match = pattern.match input mode=(Regex_Mode.Bounded 2 8)
match.length . should_equal 3
match.at 0 . text 0 . should_equal "cd"
match.at 1 . text 0 . should_equal "ef"
match.at 2 . text 0 . should_equal "gh"
Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <|
pattern = engine.compile "" []
match_1 = pattern.match "" mode=Regex_Mode.All
match_1.length . should_equal 1
match_1.at 0 . start 0 . should_equal 0
match_1.at 0 . end 0 . should_equal 0
match_2 = pattern.match "ABC" mode=Regex_Mode.All
match_2.length . should_equal 4
match_2.at 0 . start 0 . should_equal 0
match_2.at 0 . end 0 . should_equal 0
match_2.at 1 . start 0 . should_equal 1
match_2.at 1 . end 0 . should_equal 1
match_2.at 3 . start 0 . should_equal 3
match_2.at 3 . end 0 . should_equal 3
Test.group "The default regex engine's Pattern.find" <|
Test.specify "should be able to `find` the first instance of the pattern in the input" <| Test.specify "should be able to `find` the first instance of the pattern in the input" <|
pattern = Regex_2.compile "(..)" pattern = Regex_2.compile "(..)"
input = "abcdefghij" input = "abcdefghij"
@ -229,6 +128,14 @@ spec =
Regex_2.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"] Regex_2.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
Regex_2.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"] Regex_2.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"]
Test.specify "`find` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.find "ABC" . should_fail_with Illegal_Argument
Test.specify "`find_all` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.find_all "ABC" . should_fail_with Illegal_Argument
## ##
Test.group "The default regex engine's Pattern.split" <| Test.group "The default regex engine's Pattern.split" <|
engine = Default_Engine.new engine = Default_Engine.new
@ -279,142 +186,122 @@ spec =
match.at 3 . should_equal "e" match.at 3 . should_equal "e"
match.at 4 . should_equal "f" match.at 4 . should_equal "f"
Test.group "The default regex engine's Pattern.replace" <| Test.group "Pattern_2.replace" <|
engine = Default_Engine.new
Test.specify "should be able to `replace` the first instance of the pattern in the input" <| Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
pattern = engine.compile "abc" [] pattern = Regex_2.compile "abc"
input = "aa ab abc a bc abc" input = "aa ab abc a bc abc"
match = pattern.replace input "REPLACED" mode=Matching_Mode.First match = pattern.replace input "REPLACED" only_first=True
match . should_be_a Text match . should_be_a Text
match . should_equal "aa ab REPLACED a bc abc" match . should_equal "aa ab REPLACED a bc abc"
Test.specify "should return the string unchanged if there are no matches to replace in first mode" <| Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <|
pattern = engine.compile "xyz" [] pattern = Regex_2.compile "xyz"
input = "aa ab ac ad" input = "aa ab ac ad"
match = pattern.replace input "REPLACED" mode=Matching_Mode.First match = pattern.replace input "REPLACED" only_first=True
match . should_equal input
Test.specify "should be able to replace at most N instances of the pattern in the input" <|
pattern = engine.compile "aa" []
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "REPLACED" mode=3
match . should_equal "REPLACED ab REPLACED ac ad REPLACED aa ax"
Test.specify "should replace fewer than N instances when there are fewer than N in the input" <|
pattern = engine.compile "aa" []
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "REPLACED" mode=10
match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
Test.specify "should return the input when a counted replace fails" <|
pattern = engine.compile "aa" []
input = "abcdefghij"
match = pattern.replace input "REPLACED" mode=3
match . should_equal input match . should_equal input
Test.specify "should be able to replace all instances of the pattern in the input" <| Test.specify "should be able to replace all instances of the pattern in the input" <|
pattern = engine.compile "aa" [] pattern = Regex_2.compile "aa"
input = "aa ab aa ac ad aa aa ax" input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "REPLACED" mode=Regex_Mode.All match = pattern.replace input "REPLACED"
match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax" match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
Test.specify "should return the input when an all replace fails" <| Test.specify "should return the input when an all replace fails" <|
pattern = engine.compile "aa" [] pattern = Regex_2.compile "aa"
input = "abcdefghij" input = "abcdefghij"
match = pattern.replace input "REPLACED" mode=Regex_Mode.All match = pattern.replace input "REPLACED"
match . should_equal input match . should_equal input
Test.specify "should be able to replace the entire input only if it matches" <| Test.specify "should be able to replace the entire input only if it matches" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" [] pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
match = pattern.replace input "REPLACED" mode=Regex_Mode.Full match = pattern.replace input "REPLACED"
match . should_equal "REPLACED" match . should_equal "REPLACED"
Test.specify "should correctly replace entire input in Full mode even if partial matches are possible" <|
pattern = engine.compile "(aa)+" []
pattern.replace "aaa" "REPLACED" mode=Regex_Mode.Full . should_equal "aaa"
pattern.replace "aaaa" "REPLACED" mode=Regex_Mode.Full . should_equal "REPLACED"
Test.specify "should return the input for a full replace if the pattern doesn't match the entire input" <|
pattern = engine.compile "(..)" []
input = "aa ab"
full_match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
full_match . should_equal input
Test.specify "should not perform overlapping replacements in counted mode" <|
pattern = engine.compile "(..)" []
input = "abcdefghij"
result = pattern.replace input "REPLACED" mode=3
result . should_equal "REPLACEDREPLACEDREPLACEDghij"
Test.specify "should not perform overlapping replacements in all mode" <| Test.specify "should not perform overlapping replacements in all mode" <|
pattern = engine.compile "(..)" [] pattern = Regex_2.compile "(..)"
input = "aa ab" input = "aa ab"
match = pattern.replace input "REPLACED" mode=Regex_Mode.All match = pattern.replace input "REPLACED"
match . should_equal "REPLACEDREPLACEDb" match . should_equal "REPLACEDREPLACEDb"
Test.specify "should handle capture groups in replacement" <| Test.specify "should handle capture groups in replacement" <|
pattern = engine.compile "(?<capture>[a-z]+)" [] pattern = Regex_2.compile "(?<capture>[a-z]+)"
pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]" pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$1]" mode=0 . should_equal "foo bar, baz" pattern.replace "foo bar, baz" "[$1]" only_first=True . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[$1]" mode=1 . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[$1]" mode=2 . should_equal "[foo] [bar], baz"
pattern.replace "foo bar, baz" "[$1]" mode=3 . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$1]" mode=4 . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"
pattern.replace "foo bar, baz" "[${capture}]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]" pattern.replace "foo bar, baz" "[$<capture>]" . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[${capture}]" mode=0 . should_equal "foo bar, baz" pattern.replace "foo bar, baz" "[$<capture>]" only_first=True . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[${capture}]" mode=1 . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[${capture}]" mode=2 . should_equal "[foo] [bar], baz"
pattern.replace "foo bar, baz" "[${capture}]" mode=3 . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[${capture}]" mode=4 . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"
Test.specify "should handle capture groups in replacement in All mode" <| pattern.replace "foo bar, baz" "[$0]" . should_equal "[foo] [bar], [baz]"
pattern = engine.compile "([a-z]+)" [] pattern.replace "foo bar, baz" "[$0]" only_first=True . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.Full . should_equal "foo bar, baz" pattern.replace "foo bar, baz" "[$&]" . should_equal "[foo] [bar], [baz]"
pattern.replace "foo" "[$1]" mode=Regex_Mode.Full . should_equal "[foo]" pattern.replace "foo bar, baz" "[$&]" only_first=True . should_equal "[foo] bar, baz"
pattern_2 = engine.compile '<a href="(?<addr>.*?)">(?<name>.*?)</a>' [] Test.specify "should handle unicode in capture group names" <|
pattern_2.replace '<a href="url">content</a>' "$2 <- $1" mode=Regex_Mode.Full . should_equal "content <- url" pattern = Regex_2.compile "(?<건반>[a-z]+)"
pattern_2.replace '<a href="url">content</a>' "${name} <- ${addr}" mode=Regex_Mode.Full . should_equal "content <- url" pattern.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]"
Test.group "Match.group" <| Test.group "should correctly evaluate documentation examples" <|
engine = Default_Engine.new Test.specify "example 1" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" [] pattern = Regex_2.compile 'aa'
pattern.replace 'aaa' 'b' . should_equal 'ba'
Test.specify "example 2" <|
pattern = Regex_2.compile '[lo]'
pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!'
Test.specify "example 3" <|
pattern = Regex_2.compile 'l'
pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!'
Test.specify "example 4" <|
pattern = Regex_2.compile '"(.*?)"'
pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz'
Test.specify "example 5" <|
pattern = Regex_2.compile "aa"
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "xyz"
match . should_equal "xyz ab xyz ac ad xyz xyz ax"
Test.specify "example 6" <|
pattern = Regex_2.compile "([a-z]+)"
pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
Test.specify "`replace` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.replace "ABC" . should_fail_with Illegal_Argument
Test.group "Match.text" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First match = pattern.match input
match . should_be_a Default_Engine.Match.Value match . should_be_a Match_2
Test.specify "should return the full match with index 0" <| Test.specify "should return the full match with index 0" <|
match.group 0 . should_equal "aa ab abc a bc bcd" match.text 0 . should_equal "aa ab abc a bc bcd"
Test.specify "should return the group contents if it matches by index" <| Test.specify "should return the group contents if it matches by index" <|
match.group 1 . should_equal "aa ab " match.text 1 . should_equal "aa ab "
Test.specify "should return the group contents if it matches by name" <| Test.specify "should return the group contents if it matches by name" <|
match.group "letters" . should_equal "abc a bc bcd" match.text "letters" . should_equal "abc a bc bcd"
Test.specify "should return Nothing if the group did not match" <| Test.specify "should return Nothing if the group did not match" <|
match.group 3 . should_equal Nothing match.text 3 . should_equal Nothing
Test.specify "should fail with No_Such_Group_Error if the group did not exist" <| Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
match.group "fail" . should_fail_with No_Such_Group match.text "fail" . should_fail_with No_Such_Group
match.group 5 . should_fail_with No_Such_Group match.text 5 . should_fail_with No_Such_Group
Test.specify "should make named groups accessible by index" <| Test.specify "should make named groups accessible by index" <|
match.group 2 . should_equal (match.group "letters") match.text 2 . should_equal (match.text "letters")
Test.group "Match.groups" <| Test.group "Match.groups" <|
engine = Default_Engine.new pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First match = pattern.match input
match . should_be_a Default_Engine.Match.Value match . should_be_a Match_2
Test.specify "should return the results of all groups" <| Test.specify "should return the results of all groups" <|
groups = match.groups groups = match.groups
@ -485,59 +372,57 @@ spec =
match.end 5 . should_fail_with No_Such_Group match.end 5 . should_fail_with No_Such_Group
match.end "nonexistent" . should_fail_with No_Such_Group match.end "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.utf16_start" <| Test.group "Match.utf_16_start" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
match = pattern.match input match = pattern.match input
match . should_be_a Match_2 match . should_be_a Match_2
Test.specify "should return the start of a group by index" <| Test.specify "should return the start of a group by index" <|
match.utf16_start 1 . should_equal 0 match.utf_16_start 1 . should_equal 0
Test.specify "should return the start of a group by name" <| Test.specify "should return the start of a group by name" <|
match.utf16_start "letters" . should_equal 6 match.utf_16_start "letters" . should_equal 6
Test.specify "should return Nothing if the group didn't match" <| Test.specify "should return Nothing if the group didn't match" <|
match.utf16_start 3 . should_equal Nothing match.utf_16_start 3 . should_equal Nothing
match.utf16_start "empty" . should_equal Nothing match.utf_16_start "empty" . should_equal Nothing
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.utf16_start 5 . should_fail_with No_Such_Group match.utf_16_start 5 . should_fail_with No_Such_Group
match.utf16_start "nonexistent" . should_fail_with No_Such_Group match.utf_16_start "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.utf16_end" <| Test.group "Match.utf_16_end" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
match = pattern.match input match = pattern.match input
match . should_be_a Match_2 match . should_be_a Match_2
Test.specify "should return the end of a group by index" <| Test.specify "should return the end of a group by index" <|
match.utf16_end 1 . should_equal 6 match.utf_16_end 1 . should_equal 6
Test.specify "should return the end of a group by name" <| Test.specify "should return the end of a group by name" <|
match.utf16_end "letters" . should_equal 18 match.utf_16_end "letters" . should_equal 18
Test.specify "should return Nothing if the group didn't match" <| Test.specify "should return Nothing if the group didn't match" <|
match.utf16_end 3 . should_equal Nothing match.utf_16_end 3 . should_equal Nothing
match.utf16_end "empty" . should_equal Nothing match.utf_16_end "empty" . should_equal Nothing
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.utf16_end 5 . should_fail_with No_Such_Group match.utf_16_end 5 . should_fail_with No_Such_Group
match.utf16_end "nonexistent" . should_fail_with No_Such_Group match.utf_16_end "nonexistent" . should_fail_with No_Such_Group
##
Test.group "Match.span" <| Test.group "Match.span" <|
engine = Default_Engine.new pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First match = pattern.match input
match . should_be_a Default_Engine.Match.Value match . should_be_a Match_2
Test.specify "should get the span of a group by index" <| Test.specify "should get the span of a group by index" <|
match.span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input) match.span 1 . should_equal (Span.Value (0.up_to 6) input)
Test.specify "should get the span of a group by name" <| Test.specify "should get the span of a group by name" <|
match.span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input) match.span "letters" . should_equal (Span.Value (6.up_to 18) input)
Test.specify "should return Nothing if the group didn't match" <| Test.specify "should return Nothing if the group didn't match" <|
match.span 3 . should_equal Nothing match.span 3 . should_equal Nothing
@ -547,45 +432,35 @@ spec =
match.span 5 . should_fail_with No_Such_Group match.span 5 . should_fail_with No_Such_Group
match.span "nonexistent" . should_fail_with No_Such_Group match.span "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.start_position" <| Test.group "Match.utf_16_span" <|
engine = Default_Engine.new pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd" input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First match = pattern.match input
match . should_be_a Default_Engine.Match.Value match . should_be_a Match_2
Test.specify "should return the region start over which self match was performed" <| Test.specify "should get the UTF16 span of a group by index" <|
match.start_position . should_equal 0 match.utf_16_span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input)
Test.group "Match.end_position" <| Test.specify "should get the UTF16 span of a group by name" <|
engine = Default_Engine.new match.utf_16_span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input)
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
Test.specify "should return the region end over which self match was performed" <| Test.specify "should return Nothing if the group didn't match" <|
match.end_position . should_equal 18 match.utf_16_span 3 . should_equal Nothing
match.utf_16_span "empty" . should_equal Nothing
Test.group "Regex options handling" <| Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
Test.specify "should work properly with flag options" <| match.utf_16_span 5 . should_fail_with No_Such_Group
flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[] match.utf_16_span "nonexistent" . should_fail_with No_Such_Group
flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
Test.specify "should properly override vector options" <| Test.group "caching" <|
flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[Regex_Option.Multiline, Regex_Option.Case_Insensitive] Test.specify "Replacer cache drops old values" <|
flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Case_Insensitive, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments] pattern = Regex_2.compile('([a-c])')
Test.group "Regexes" <| # Add enough values to flush out the first values.
Test.specify "should be able to be compiled" <| 0.up_to get_lru_size+1 . map i->
pattern = Regex.compile "(?<dots>..)" case_insensitive=True result = pattern.replace "abcdef" ("$1$1x" + i.to_text)
pattern . should_be_a Default_Engine.Pattern.Value result . should_not_equal Nothing
pattern.options . should_equal [Regex_Option.Case_Insensitive] replacer_cache_lookup "$1$1x0" . should_equal Nothing
replacer_cache_lookup "$1$1x1" . should_not_equal Nothing
Test.specify "should be able to be escaped" <|
pattern = "http://example.com"
Regex.escape pattern . should_equal "\Qhttp://example.com\E"
## TODO: Missing tests for No_Such_Group_Error
main = Test_Suite.run_main spec main = Test_Suite.run_main spec

View File

@ -7,6 +7,7 @@ import Standard.Base.Errors.Common.Index_Out_Of_Bounds
import Standard.Base.Errors.Common.Incomparable_Values import Standard.Base.Errors.Common.Incomparable_Values
import Standard.Base.Errors.Common.Type_Error import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.IO
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
@ -15,6 +16,7 @@ from Standard.Base.Data.Index_Sub_Range.Index_Sub_Range import all
from Standard.Test import Test, Test_Suite from Standard.Test import Test, Test_Suite
import Standard.Test.Extensions import Standard.Test.Extensions
import Standard.Base.Data.Text.Extensions
type Auto type Auto
Value a Value a
@ -1190,9 +1192,9 @@ spec =
"Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing "Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
Test.specify "find should produce correct spans" <| Test.specify "find should produce correct spans" <|
"Hello World!".find ".o" Case_Sensitivity.Insensitive . grapheme_span 0 . should_equal (Span.Value (3.up_to 5) "Hello World!") "Hello World!".find ".o" Case_Sensitivity.Insensitive . span 0 . should_equal (Span.Value (3.up_to 5) "Hello World!")
"Hello World!".find_all ".o" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"] "Hello World!".find_all ".o" . map (match-> match.span 0) . should_equal [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
"foobar".find "BAR" Case_Sensitivity.Insensitive . grapheme_span 0 . should_equal (Span.Value (3.up_to 6) "foobar") "foobar".find "BAR" Case_Sensitivity.Insensitive . span 0 . should_equal (Span.Value (3.up_to 6) "foobar")
Test.specify "should handle accents and other multi-point graphemes" <| Test.specify "should handle accents and other multi-point graphemes" <|
accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h' accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
@ -1201,29 +1203,20 @@ spec =
accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}' accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}'
# Check both UTF16 spans # Check both UTF16 spans
accents.find_all 'h' . map (match-> match.span 0) . should_equal [Utf_16_Span.Value (6.up_to 7) accents, Utf_16_Span.Value (9.up_to 10) accents] accents.find_all 'h' . map (match-> match.utf_16_span 0) . should_equal [Utf_16_Span.Value (6.up_to 7) accents, Utf_16_Span.Value (9.up_to 10) accents]
accents.find_all 'e\u{301}' . map (match-> match.span 0) . should_equal [Utf_16_Span.Value (2.up_to 4) accents, Utf_16_Span.Value (7.up_to 9) accents] accents.find_all 'e\u{301}' . map (match-> match.utf_16_span 0) . should_equal [Utf_16_Span.Value (2.up_to 4) accents, Utf_16_Span.Value (7.up_to 9) accents]
# Check both grapheme spans # Check both grapheme spans
accents.find_all 'h' . map (match-> match.grapheme_span 0) . should_equal [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents] accents.find_all 'h' . map (match-> match.span 0) . should_equal [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
accents.find_all 'e\u{301}' . map (match-> match.grapheme_span 0) . should_equal [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents] accents.find_all 'e\u{301}' . map (match-> match.span 0) . should_equal [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
# Check contents to make sure the spans' ranges are ok # Check contents to make sure the spans' ranges are ok
accents.find 'h' . text 0 . should_equal 'h' accents.find 'h' . text 0 . should_equal 'h'
accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}' accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}'
Test.specify "should correctly handle regex edge cases in locate" pending="Figure out how to make Regex correctly handle empty patterns." <| Test.specify "should correctly handle regex edge cases in `find`" <|
regex = Regex_Matcher.Value "".find "foo" . should_equal Nothing
"".match "foo" matcher=regex . should_equal Nothing "".find_all "foo" . should_equal []
"".match "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing
"".match_all "foo" matcher=regex . should_equal []
"".match "" matcher=regex . should_equal ""
"".match_all "" matcher=regex . should_equal [""]
"".match "" matcher=regex mode=Matching_Mode.Last . should_equal ""
abc = 'A\u{301}ßC'
abc.match "" matcher=regex . should_equal abc
abc.match_all "" matcher=regex . should_equal ["", "", "", "", ""]
abc.match "" matcher=regex mode=Matching_Mode.Last . should_equal ""
Test.specify "should handle overlapping matches as shown in the examples" <| Test.specify "should handle overlapping matches as shown in the examples" <|
"aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (1.up_to 3) "aaa") "aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (1.up_to 3) "aaa")
@ -1256,6 +1249,12 @@ spec =
txt.find "^m..a..z.a$" . text 0 . should_equal "maza건반zaa" txt.find "^m..a..z.a$" . text 0 . should_equal "maza건반zaa"
txt.find "a..z" . text 0 . should_equal "a건반z" txt.find "a..z" . text 0 . should_equal "a건반z"
Test.specify "`find` with an empty pattern should be an error" <|
'b'.find '' . should_fail_with Illegal_Argument
Test.specify "`find_all` with an empty pattern should be an error" <|
'b'.find_all '' . should_fail_with Illegal_Argument
Test.specify "should be possible in case-insensitive mode" <| Test.specify "should be possible in case-insensitive mode" <|
"MY".find "my" Case_Sensitivity.Insensitive . text 0 . should_equal "MY" "MY".find "my" Case_Sensitivity.Insensitive . text 0 . should_equal "MY"
@ -1281,20 +1280,20 @@ spec =
expose normalization methods to allow developers to do it expose normalization methods to allow developers to do it
themselves. themselves.
accents = 'a\u{301}e\u{301}o\u{301}' accents = 'a\u{301}e\u{301}o\u{301}'
accents.find accent_1 . grapheme_span 0 . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}') accents.find accent_1 . span 0 . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
Test.specify "can return a vector of all match groups" <| Test.specify "can return a vector of all match groups" <|
"abc".find "ab((c)|(d))" . groups . should_equal ['abc', 'c', 'c', Nothing] "abc".find "ab((c)|(d))" . groups . should_equal ['abc', 'c', 'c', Nothing]
Test.specify "should default to group 0 in .span and .grapheme_span" <| Test.specify "should default to group 0 in .utf_16_span and .span" <|
"abacadae".find "a[bc]" . span . should_equal (Utf_16_Span.Value (0.up_to 2) "abacadae") "abacadae".find "a[bc]" . utf_16_span . should_equal (Utf_16_Span.Value (0.up_to 2) "abacadae")
'a\u{301}e\u{301}o\u{301}'.find 'e\u{301}' . grapheme_span . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}') 'a\u{301}e\u{301}o\u{301}'.find 'e\u{301}' . span . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
Test.specify "should allow to match one or more occurrences of a pattern in the text" <| Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
"abacadae".find_all "a[bc]" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"] "abacadae".find_all "a[bc]" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
"abacadae".find_all "a." . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"] "abacadae".find_all "a." . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
"abacadae".find_all "a.*" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 8) "abacadae"] "abacadae".find_all "a.*" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 8) "abacadae"]
"abacadae".find_all "a.+?" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"] "abacadae".find_all "a.+?" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
Test.specify "should allow access to match groups by number" <| Test.specify "should allow access to match groups by number" <|
"abcddd".find "ab(c(d+))" . text 0 . should_equal "abcddd" "abcddd".find "ab(c(d+))" . text 0 . should_equal "abcddd"
@ -1331,8 +1330,13 @@ spec =
Test.specify "should expand a partial-grapheme match to the whole grapheme" <| Test.specify "should expand a partial-grapheme match to the whole grapheme" <|
'e\u{301}'.find '\u{301}' . text 0 . should_equal 'e\u{301}' 'e\u{301}'.find '\u{301}' . text 0 . should_equal 'e\u{301}'
Test.specify "should not allow non-default locale" <|
locale = Locale.new "en" "GB" "UTF-8"
'a'.find 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument
'a'.find_all 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument
Test.group "Text.match" <| Test.group "Text.match" <|
Test.specify "should default to regex" <| Test.specify "should work correctly" <|
"My Text: Goes Here".match "^My Text: (.+)$" . should_be_true "My Text: Goes Here".match "^My Text: (.+)$" . should_be_true
"555-801-1923".match "^\d{3}-\d{3}-\d{4}$" . should_be_true "555-801-1923".match "^\d{3}-\d{3}-\d{4}$" . should_be_true
"Hello".match "^[a-z]+$" . should_be_false "Hello".match "^[a-z]+$" . should_be_false
@ -1344,12 +1348,19 @@ spec =
"abcd".match "abc" . should_be_false "abcd".match "abc" . should_be_false
"x".match "[a-z]" . should_be_true "x".match "[a-z]" . should_be_true
Test.specify "`match` with an empty pattern should be an error" <|
'b'.match '' . should_fail_with Illegal_Argument
Test.specify "should be possible on unicode text" <| Test.specify "should be possible on unicode text" <|
"Korean: 건반".match "^Korean: (.+)$" . should_be_true "Korean: 건반".match "^Korean: (.+)$" . should_be_true
Test.specify "should be possible in case-insensitive mode" <| Test.specify "should be possible in case-insensitive mode" <|
"MY".match "my" Case_Sensitivity.Insensitive . should_be_true "MY".match "my" Case_Sensitivity.Insensitive . should_be_true
Test.specify "should not allow non-default locale" <|
locale = Locale.new "en" "GB" "UTF-8"
'a'.match 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument
Test.group "Regex splitting" <| Test.group "Regex splitting" <|
Test.specify "should be possible on text" <| Test.specify "should be possible on text" <|
splits = "abcde".split "[bd]" Regex_Matcher.Value splits = "abcde".split "[bd]" Regex_Matcher.Value
@ -1402,141 +1413,113 @@ spec =
Test.group "Text.replace" <| Test.group "Text.replace" <|
Test.specify "should work as in examples" <| Test.specify "should work as in examples" <|
'aaa'.replace 'aa' 'b' . should_equal 'ba' 'aaa'.replace 'aa' 'b' . should_equal 'ba'
"Hello World!".replace "[lo]" "#" matcher=Regex_Matcher.Value . should_equal "He### W#r#d!" "Hello World!".replace "[lo]" "#" use_regex=True . should_equal "He### W#r#d!"
"Hello World!".replace "l" "#" mode=Matching_Mode.First . should_equal "He#lo World!" "Hello World!".replace "l" "#" only_first=True . should_equal "He#lo World!"
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' matcher=Regex_Matcher.Value . should_equal '(abc) foo (bar) baz' '"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' use_regex=True . should_equal '(abc) foo (bar) baz'
'ß'.replace 'S' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'AA'
'affib'.replace 'i' 'X' matcher=Text_Matcher.Case_Insensitive . should_equal 'aXb' Test.specify "works when mapped over a vector of inputs" <|
inputs = ["axyz", "bxyz", "xabcz", "zazaz"]
inputs.map (s-> s.replace "[abc]" "q" use_regex=True) . should_equal ["qxyz", "qxyz", "xqqqz", "zqzqz"]
Test.specify "should correctly handle empty-string edge cases" <| Test.specify "should correctly handle empty-string edge cases" <|
[Regex_Mode.All, Matching_Mode.First, Matching_Mode.Last] . each mode-> [True, False] . each only_first->
'aaa'.replace '' 'foo' mode=mode . should_equal 'aaa' 'aaa'.replace '' 'foo' only_first=only_first . should_equal 'aaa'
''.replace '' '' mode=mode . should_equal '' 'a'.replace 'a' '' only_first=only_first . should_equal ''
'a'.replace 'a' '' mode=mode . should_equal '' ''.replace 'a' 'b' only_first=only_first . should_equal ''
''.replace 'a' 'b' mode=mode . should_equal ''
'aba' . replace 'a' '' Matching_Mode.First . should_equal 'ba' 'aba' . replace 'a' '' only_first=True . should_equal 'ba'
'aba' . replace 'a' '' Matching_Mode.Last . should_equal 'ab'
'aba' . replace 'a' '' . should_equal 'b' 'aba' . replace 'a' '' . should_equal 'b'
'aba' . replace 'c' '' . should_equal 'aba' 'aba' . replace 'c' '' . should_equal 'aba'
Test.specify "should correctly handle first, all and last matching with overlapping occurrences" <| Test.specify "should correctly handle first, all and last matching with overlapping occurrences" <|
"aaa aaa".replace "aa" "c" . should_equal "ca ca" "aaa aaa".replace "aa" "c" . should_equal "ca ca"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First . should_equal "ca aaa" "aaa aaa".replace "aa" "c" only_first=True . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last . should_equal "aaa ac"
Test.specify "Regex `replace` with an empty pattern should be an error" <|
'b'.replace '' 'c' use_regex=True . should_fail_with Illegal_Argument
Test.specify "should correctly handle case-insensitive matches" <| Test.specify "should correctly handle case-insensitive matches" <|
'AaąĄ' . replace "A" "-" matcher=Text_Matcher.Case_Insensitive . should_equal '--ąĄ' 'AaąĄ' . replace "A" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal '--ąĄ'
'AaąĄ' . replace "A" "-" . should_equal '-aąĄ' 'AaąĄ' . replace "A" "-" . should_equal '-aąĄ'
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' matcher=Text_Matcher.Case_Sensitive . should_equal 'HeLlO wOrLd' 'HeLlO wOrLd' . replace 'hElLo' 'Hey,' case_sensitivity=Case_Sensitivity.Sensitive . should_equal 'HeLlO wOrLd'
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' matcher=Text_Matcher.Case_Insensitive . should_equal 'Hey, wOrLd' 'HeLlO wOrLd' . replace 'hElLo' 'Hey,' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'Hey, wOrLd'
"Iiİı" . replace "i" "-" . should_equal "I-İı" "Iiİı" . replace "i" "-" . should_equal "I-İı"
"Iiİı" . replace "I" "-" . should_equal "-iİı" "Iiİı" . replace "I" "-" . should_equal "-iİı"
"Iiİı" . replace "İ" "-" . should_equal "Ii-ı" "Iiİı" . replace "İ" "-" . should_equal "Ii-ı"
"Iiİı" . replace "ı" "-" . should_equal "Iiİ-" "Iiİı" . replace "ı" "-" . should_equal "Iiİ-"
"Iiİı" . replace "i" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "--İı" "Iiİı" . replace "i" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "--İı"
"Iiİı" . replace "I" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "--İı" "Iiİı" . replace "I" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "--İı"
"Iiİı" . replace "İ" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "Ii-ı" "Iiİı" . replace "İ" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "Ii-ı"
"Iiİı" . replace "ı" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "Iiİ-" "Iiİı" . replace "ı" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "Iiİ-"
tr_insensitive = Text_Matcher.Case_Insensitive (Locale.new "tr") Test.specify "should correctly handle Unicode" <|
"Iiİı" . replace "i" "-" matcher=tr_insensitive . should_equal "I--ı" 'ß'.replace 'S' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'AA'
"Iiİı" . replace "I" "-" matcher=tr_insensitive . should_equal "-iİ-" 'ß'.replace 'ß' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'A'
"Iiİı" . replace "İ" "-" matcher=tr_insensitive . should_equal "I--ı" 'affib'.replace 'i' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
"Iiİı" . replace "ı" "-" matcher=tr_insensitive . should_equal "-iİ-" 'affib'.replace 'ffi' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
Test.specify "should correctly handle Unicode edge cases" <|
'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}' 'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}'
'sśs\u{301}' . replace 's' 'O' Matching_Mode.Last . should_equal 'Ośs\u{301}' 'śss\u{301}' . replace 's' 'O' only_first=True . should_equal 'śOs\u{301}'
'śs\u{301}s' . replace 's' 'O' Matching_Mode.First . should_equal 'śs\u{301}O'
'sśs\u{301}' . replace 'ś' 'O' . should_equal 'sOO' 'sśs\u{301}' . replace 'ś' 'O' . should_equal 'sOO'
'śss\u{301}' . replace 'ś' 'O' only_first=True . should_equal 'Oss\u{301}'
'sśs\u{301}' . replace 's\u{301}' 'O' . should_equal 'sOO' 'sśs\u{301}' . replace 's\u{301}' 'O' . should_equal 'sOO'
's\u{301}śs' . replace 's\u{301}' 'O' . should_equal 'OOs'
'SŚS\u{301}' . replace 's' 'O' . should_equal 'SŚS\u{301}' 'SŚS\u{301}' . replace 's' 'O' . should_equal 'SŚS\u{301}'
'SŚS\u{301}' . replace 's' 'O' Matching_Mode.Last . should_equal 'SŚS\u{301}' 'ŚS\u{301}S' . replace 's' 'O' only_first=True . should_equal 'ŚS\u{301}S'
'ŚS\u{301}S' . replace 's' 'O' Matching_Mode.First . should_equal 'ŚS\u{301}S'
'SŚS\u{301}' . replace 'ś' 'O' . should_equal 'SŚS\u{301}' 'SŚS\u{301}' . replace 'ś' 'O' . should_equal 'SŚS\u{301}'
'SŚS\u{301}' . replace 's\u{301}' 'O' . should_equal 'SŚS\u{301}' 'SŚS\u{301}' . replace 's\u{301}' 'O' . should_equal 'SŚS\u{301}'
'SŚS\u{301}' . replace 's' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'OŚS\u{301}' 'SŚS\u{301}' . replace 's' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'OŚS\u{301}'
'SŚS\u{301}' . replace 's' 'O' Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'OŚS\u{301}' 'ŚS\u{301}S' . replace 's' 'O' only_first=True case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'ŚS\u{301}O' # 'ŚO\u{301}O' # 'ŚOS\u{301}S'
'ŚS\u{301}S' . replace 's' 'O' Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'ŚS\u{301}O'
'SŚS\u{301}' . replace 'ś' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'SOO' 'SŚS\u{301}' . replace 'ś' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'SOO'
'SŚS\u{301}' . replace 's\u{301}' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'SOO' 'SŚS\u{301}' . replace 's\u{301}' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'SOO'
'✨🚀🚧😍😃😍😎😙😉☺' . replace '🚧😍' '|-|:)' . should_equal '✨🚀|-|:)😃😍😎😙😉☺' '✨🚀🚧😍😃😍😎😙😉☺' . replace '🚧😍' '|-|:)' . should_equal '✨🚀|-|:)😃😍😎😙😉☺'
'Rocket Science' . replace 'Rocket' '🚀' . should_equal '🚀 Science' 'Rocket Science' . replace 'Rocket' '🚀' . should_equal '🚀 Science'
"Korean: 건반".replace "건반" "keyboard" . should_equal "Korean: keyboard" "Korean: 건반".replace "건반" "keyboard" . should_equal "Korean: keyboard"
Test.specify "will approximate ligature matches" <| Test.specify "regex and non-regex replace handle accented grapheme splitting differently" <|
# TODO do we want to improve this? highly non-trivial for very rare edge cases 'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}'
## Currently we lack 'resolution' to extract a partial match from 'sśs\u{301}' . replace 's' 'O' use_regex=True . should_equal 'OśO\u{301}'
the ligature to keep it, probably would need some special
mapping.
'ffiffi'.replace 'ff' 'aa' matcher=Text_Matcher.Case_Insensitive . should_equal 'aaaa'
'ffiffi'.replace 'ff' 'aa' mode=Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'aaffi'
'ffiffi'.replace 'ff' 'aa' mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'ffiaa'
'affiffib'.replace 'IF' 'X' matcher=Text_Matcher.Case_Insensitive . should_equal 'aXb'
'aiffiffz' . replace 'if' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'a--fz'
'AFFIB'.replace 'ffi' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'A-B'
'ß'.replace 'SS' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
'ß'.replace 'S' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'AA'
'ß'.replace 'S' 'A' mode=Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
'ß'.replace 'S' 'A' mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
'STRASSE'.replace 'ß' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'STRA-E'
Test.specify "should perform simple replacement in Regex mode" <| Test.specify "should perform simple replacement in Regex mode" <|
"ababab".replace "b" "a" matcher=Regex_Matcher.Value . should_equal "aaaaaa" "ababab".replace "b" "a" use_regex=True . should_equal "aaaaaa"
"ababab".replace "b" "a" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "aaabab" "ababab".replace "b" "a" only_first=True use_regex=True . should_equal "aaabab"
"ababab".replace "b" "a" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "ababaa"
"aaaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "cc" "aaaa".replace "aa" "c" use_regex=True . should_equal "cc"
"aaaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "caa" "aaaa".replace "aa" "c" only_first=True use_regex=True . should_equal "caa"
"aaaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "aac"
"aaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "ca" "aaa".replace "aa" "c" use_regex=True . should_equal "ca"
"aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "ca" "aaa".replace "aa" "c" only_first=True use_regex=True . should_equal "ca"
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Sensitive . should_equal "ac"
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "ca"
"aaa aaa".replace "aa" "c" matcher=Text_Matcher.Case_Sensitive . should_equal "ca ca" "aaa aaa".replace "aa" "c" case_sensitivity=Case_Sensitivity.Sensitive use_regex=True . should_equal "ca ca"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Text_Matcher.Case_Sensitive . should_equal "ca aaa" "aaa aaa".replace "aa" "c" only_first=True case_sensitivity=Case_Sensitivity.Sensitive use_regex=True . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Sensitive . should_equal "aaa ac" "aaa aaa".replace "aa" "c" use_regex=True . should_equal "ca ca"
"aaa aaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "ca ca" "aaa aaa".replace "aa" "c" only_first=True use_regex=True . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "aaa ca"
Test.specify "in Regex mode should work with Unicode" <| Test.specify "in Regex mode should work with Unicode" <|
"Korean: 건반".replace "건반" "keyboard" matcher=Regex_Matcher.Value . should_equal "Korean: keyboard" "Korean: 건반".replace "건반" "keyboard" use_regex=True . should_equal "Korean: keyboard"
'sśs\u{301}'.replace 'ś' '-' matcher=Regex_Matcher.Value . should_equal 's--' 'sśs\u{301}'.replace 'ś' '-' use_regex=True . should_equal 's-s\u{301}'
'sśs\u{301}'.replace 's\u{301}' '-' matcher=Regex_Matcher.Value . should_equal 's--' 'sśs\u{301}'.replace 's\u{301}' '-' use_regex=True . should_equal 'sś-'
Test.specify "in Regex mode should support various Regex options" <|
r1 = "İiİ".replace "\w" "a" matcher=(Regex_Matcher.Value match_ascii=True)
r1 . should_equal "İaİ"
r2 = "abaBa".replace "b" "a" matcher=(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive)
r2 . should_equal "aaaaa"
r3 = 'ab\na'.replace "b." "a" matcher=(Regex_Matcher.Value dot_matches_newline=True)
r3 . should_equal "aaa"
text = """
Foo
bar
r4 = text.replace '\n' "" matcher=(Regex_Matcher.Value multiline=True)
r4 . should_equal "Foobar"
r5 = "ababd".replace "b\w # Replacing a `b` followed by any word character" "a" matcher=(Regex_Matcher.Value comments=True)
r5 . should_equal "aaa"
Test.specify "in Regex mode should allow referring to capture groups in substitutions" <| Test.specify "in Regex mode should allow referring to capture groups in substitutions" <|
'<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' matcher=Regex_Matcher.Value . should_equal 'content is at url' '<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' use_regex=True . should_equal 'content is at url'
'<a href="url">content</a>'.replace '<a href="(?<address>.*?)">(?<text>.*?)</a>' '${text} is at ${address}' matcher=Regex_Matcher.Value . should_equal 'content is at url' '<a href="url">content</a>'.replace '<a href="(?<address>.*?)">(?<text>.*?)</a>' '$<text> is at $<address>' use_regex=True . should_equal 'content is at url'
Test.specify "should not allow non-default locale in regex replace" <|
locale = Locale.new "en" "GB" "UTF-8"
'a'.replace 'a' 'b' case_sensitivity=(Case_Sensitivity.Insensitive locale) use_regex=True . should_fail_with Illegal_Argument
Test.specify "should allow non-default locale in text replace" <|
locale = Locale.new "en" "GB" "UTF-8"
'a'.replace 'a' 'b' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_equal 'b'
main = Test_Suite.run_main spec main = Test_Suite.run_main spec