Implement Regular Expression replace and update Text.replace to the new API (#5959)

Re-implement replace on top of Truffle regex.
This commit is contained in:
GregoryTravis 2023-03-28 02:13:12 -04:00 committed by GitHub
parent 9bec3a4e71
commit 6b9cbeacb2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 876 additions and 681 deletions

View File

@ -363,6 +363,8 @@
- [Aligned names of columns created by column operations.][5850]
- [Improved `cross_tab`. Renamed `fill_missing` and `is_missing` to
`fill_nothing` and `is_nothing`. Added `fill_empty`.][5863]
- [Removed many regex compile flags from `replace`; added `only_first`
flag.][5959]
[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@ -550,6 +552,7 @@
[5863]: https://github.com/enso-org/enso/pull/5863
[5917]: https://github.com/enso-org/enso/pull/5917
[5705]: https://github.com/enso-org/enso/pull/5705
[5959]: https://github.com/enso-org/enso/pull/5959
#### Enso Compiler

View File

@ -10,6 +10,7 @@ import project.Data.Range.Range
import project.Data.Text.Case.Case
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Data.Text.Encoding.Encoding
import project.Data.Text.Helpers
import project.Data.Text.Location.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex.Match.Match
@ -218,6 +219,10 @@ Text.characters self =
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
If an empty regex is used, `find` throws an Illegal_Argument error.
If a non-default locale is used, `find` throws an Illegal_Argument error.
> Example
Find the first substring matching the regex.
@ -227,10 +232,12 @@ Text.characters self =
example_find_insensitive =
## This matches `aBc` @ character 11
"aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error | Illegal_Argument
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
Regex_2.compile pattern case_insensitive=case_insensitive . match self
Helpers.regex_assume_default_locale case_sensitivity <|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern.if_not_error <| compiled_pattern.match self
## Finds all the matches of the regular expression `pattern` in `self`,
returning a Vector. If not found, will be an empty Vector.
@ -240,6 +247,10 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
If an empty regex is used, `find_all` throws an Illegal_Argument error.
If a non-default locale is used, `find_all` throws an Illegal_Argument error.
> Example
Find the substring matching the regex.
@ -249,10 +260,12 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
example_find_all_insensitive =
## This matches `aABbbbc` @ character 0 and `aBC` @ character 11
"aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error | Illegal_Argument
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
Regex_2.compile pattern case_insensitive=case_insensitive . match_all self
Helpers.regex_assume_default_locale case_sensitivity <|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern.if_not_error <| compiled_pattern.match_all self
## ALIAS Check Matches
@ -263,6 +276,10 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
If an empty regex is used, `match` throws an Illegal_Argument error.
If a non-default locale is used, `match` throws an Illegal_Argument error.
> Example
Checks if whole text matches a basic email regex.
@ -274,11 +291,12 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
regex = ".+ct@.+"
# Evaluates to true
"CONTACT@enso.org".match regex Case_Sensitivity.Insensitive
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error | Illegal_Argument
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern.matches self
Helpers.regex_assume_default_locale case_sensitivity <|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern.if_not_error <| compiled_pattern.matches self
## ALIAS Split Text
@ -327,21 +345,31 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
compiled_pattern.split self mode=Regex_Mode.All
## ALIAS Replace Text
Replaces the first, last, or all occurrences of term with new_text in the
input. If `term` is empty, the function returns the input unchanged.
Perform a text or regex replace.
Returns the text with all matched elements replaced by the provided
replacement. If `term` is empty and `use_regex` is False, the function returns the input unchanged.
The replacement string can contain references to groups matched by the
regex. The following syntaxes are supported:
$0: the entire match string
$&: the entire match string
$n: the nth group
$<foo>: Named group `foo`
$$: a literal `$`
Arguments:
- term: The term to find.
- new_text: The new text to replace occurrences of `term` with.
If `matcher` is a `Regex_Matcher`, `new_text` can include replacement
patterns (such as `$<n>`) for a marked group.
- mode: Specifies which occurences of term the engine tries to find. When the
mode is `First` or `Last`, this method replaces the first or last occurence
of term in the input. If set to `All`, it replaces all occurences of term in
the input.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
regular expression and matched using the associated options.
- term: The string or regex to find.
- replacement: The text to replace matches with.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
- only_first: If True, only replace the first match.
- use_regex: If true, the term is used as a regular expression.
If an empty regex is used, `replace` throws an Illegal_Argument error.
If a non-default locale is used with a regex, `replace` throws an
Illegal_Argument error.
> Example
Replace letters in the text "aaa".
@ -351,17 +379,17 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
> Example
Replace all occurrences of letters 'l' and 'o' with '#'.
"Hello World!".replace "[lo]" "#" matcher=Regex_Matcher == "He### W#r#d!"
"Hello World!".replace "[lo]" "#" use_regex=True == "He### W#r#d!"
> Example
Replace the first occurrence of letter 'l' with '#'.
"Hello World!".replace "l" "#" mode=Matching_Mode.First == "He#lo World!"
"Hello World!".replace "l" "#" only_first=True == "He#lo World!"
> Example
Replace texts in quotes with parentheses.
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' matcher=Regex_Matcher == '(abc) foo (bar) baz'
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' use_regex=True == '(abc) foo (bar) baz'
! Matching Grapheme Clusters
In case-insensitive mode, a single character can match multiple characters,
@ -378,62 +406,40 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
> Example
Extended partial matches in case-insensitive mode.
# The ß symbol matches the letter `S` twice in case-insensitive mode, because it folds to `ss`.
'ß'.replace 'S' 'A' matcher=(Text_Matcher Case_Insensitive) . should_equal 'AA'
# The ß symbol matches the letter `S` twice in case-insensitive mode, because it folds to `ss`.
'ß'.replace 'S' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'AA'
# The 'ffi' ligature is a single grapheme cluster, so even if just a part of it is matched, the whole grapheme is replaced.
'affib'.replace 'i' 'X' matcher=(Text_Matcher Case_Insensitive) . should_equal 'aXb'
! Last Match in Regex Mode
Regex always performs the search from the front and matching the last
occurrence means selecting the last of the matches while still generating
matches from the beginning. Regex does not return overlapping matches - it
will return a match at some position and then continue the search after that
match. This will lead to slightly different behavior for overlapping
occurrences of a pattern in Regex mode than in exact text matching mode
where the matches are searched for from the back.
'affib'.replace 'i' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
> Example
Comparing Matching in Last Mode in Regex and Text mode
Regexp replace.
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher . should_equal "ac"
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher . should_equal "ca"
'<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' use_regex=True == 'content is at url'
"aaa aaa".replace "aa" "c" matcher=Text_Matcher . should_equal "ca ca"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Text_Matcher . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher . should_equal "aaa ac"
"aaa aaa".replace "aa" "c" matcher=Regex_Matcher . should_equal "ca ca"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher . should_equal "aaa ca"
Text.replace : Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Text
Text.replace self term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive = if term.is_empty then self else
case matcher of
_ : Text_Matcher ->
Text.replace : Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Text ! Illegal_Argument
Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False =
case use_regex of
False -> if term.is_empty then self else
array_from_single_result result = case result of
Nothing -> Array.empty
_ -> Array.new_1 result
spans_array = case matcher of
Text_Matcher.Case_Sensitive -> case mode of
Regex_Mode.All ->
Text_Utils.span_of_all self term
Matching_Mode.First ->
array_from_single_result <| Text_Utils.span_of self term
Matching_Mode.Last ->
array_from_single_result <| Text_Utils.last_span_of self term
_ -> Error.throw (Illegal_Argument.Error "Invalid mode.")
Text_Matcher.Case_Insensitive locale -> case mode of
Regex_Mode.All ->
spans_array = case case_sensitivity of
Case_Sensitivity.Sensitive -> case only_first of
False -> Text_Utils.span_of_all self term
True -> array_from_single_result <| Text_Utils.span_of self term
Case_Sensitivity.Insensitive locale -> case only_first of
False ->
Text_Utils.span_of_all_case_insensitive self term locale.java_locale
Matching_Mode.First ->
True ->
array_from_single_result <|
Text_Utils.span_of_case_insensitive self term locale.java_locale False
Matching_Mode.Last ->
array_from_single_result <|
Text_Utils.span_of_case_insensitive self term locale.java_locale True
_ -> Error.throw (Illegal_Argument.Error "Invalid mode.")
Text_Utils.replace_spans self spans_array new_text
_ : Regex_Matcher ->
compiled_pattern = matcher.compile term
compiled_pattern.replace self new_text mode=mode
Text_Utils.replace_spans self spans_array replacement
True ->
Helpers.regex_assume_default_locale case_sensitivity <|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
compiled_pattern.if_not_error <|
compiled_pattern.replace self replacement only_first
## ALIAS Get Words
@ -1115,9 +1121,9 @@ Text.trim self where=Location.Both what=_.is_whitespace =
term = "straße"
text = "MONUMENTENSTRASSE 42"
match = text . locate term matcher=(Text_Matcher Case_Insensitive)
term.length == 6
match.length == 7
match = text . locate term case_sensitivity=Case_Sensitivity.Insensitive
term.length . should_equal 6
match.length . should_equal 7
! Matching Grapheme Clusters
In case-insensitive mode, a single character can match multiple characters,
@ -1265,11 +1271,8 @@ Text.locate_all self term="" case_sensitivity=Case_Sensitivity.Sensitive = if te
- term: The term to find.
- start: The index to start searching from. If the index is negative, it
is counted from the end of the vector.
- matcher: Specifies how the term is matched against the input:
- If a `Text_Matcher`, the text is compared using case-sensitively rules
specified in the matcher.
- If a `Regex_Matcher`, the `term` is used as a regular expression and
matched using the associated options.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
@ -1301,11 +1304,8 @@ Text.index_of self term="" start=0 case_sensitivity=Case_Sensitivity.Sensitive =
- term: The term to find.
- start: The index to start searching backwards from. If the index is
negative, it is counted from the end of the vector.
- matcher: Specifies how the term is matched against the input:
- If a `Text_Matcher`, the text is compared using case-sensitively rules
specified in the matcher.
- If a `Regex_Matcher`, the `term` is used as a regular expression and
matched using the associated options.
- case_sensitivity: Specifies if the text values should be compared case
sensitively.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode

View File

@ -0,0 +1,16 @@
from Standard.Base import all
import project.Any.Any
import project.Data.Locale.Locale
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Errors.Illegal_Argument.Illegal_Argument
## PRIVATE
   Runs `action` only when `case_sensitivity` does not request a custom
   locale, since custom locales are not supported for regexes.

   Arguments:
   - case_sensitivity: the case sensitivity setting requested by the caller.
   - action: the suspended computation to run when the setting is acceptable.

   Returns the result of `action`, or an `Illegal_Argument` error if a
   case-insensitive match with a non-default locale was requested.
regex_assume_default_locale : Case_Sensitivity -> Any -> Any ! Illegal_Argument
regex_assume_default_locale case_sensitivity ~action = case case_sensitivity of
    Case_Sensitivity.Insensitive locale -> case locale == Locale.default of
        True -> action
        False ->
            msg = "Custom locales are not supported for regexes."
            Error.throw (Illegal_Argument.Error msg)
    # Only the `Insensitive` case is inspected for a locale; any other setting
    # runs the action instead of failing the pattern match.
    # NOTE(review): presumably other variants (e.g. a `Default` setting) exist
    # on Case_Sensitivity - confirm that none of them carries a locale.
    _ -> action

View File

@ -8,8 +8,8 @@ import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Common.Index_Out_Of_Bounds
import project.Error.Error
import project.Errors.Common.Index_Out_Of_Bounds
import project.Nothing.Nothing
import project.Panic.Panic
@ -32,7 +32,7 @@ type Match_2
internal_start : Integer -> Integer
internal_start self group = self.internal_regex_result.getStart group
## PRIVATE
## PRIVATE
Returns the end UTF16 character index, plus one, of a group.
This method goes directly to the internal match object. It does not
@ -48,9 +48,9 @@ type Match_2
Arguments:
- group: the group name or number. Marked groups defined in the regex are
numbered starting at 1; group 0 refers to the entire match.
utf16_start : Integer | Text -> Integer
utf16_start self group=0 =
span = self.span group
utf_16_start : Integer | Text -> Integer
utf_16_start self group=0 =
span = self.utf_16_span group
if span.is_nothing then Nothing else span.start
## Returns the end UTF16 character index, plus one, of a group.
@ -58,9 +58,9 @@ type Match_2
Arguments:
- group: the group name or number. Marked groups defined in the regex are
numbered starting at 1; group 0 refers to the entire match.
utf16_end : Integer | Text -> Integer
utf16_end self group=0 =
span = self.span group
utf_16_end : Integer | Text -> Integer
utf_16_end self group=0 =
span = self.utf_16_span group
if span.is_nothing then Nothing else span.end
## Returns the start grapheme index of a group.
@ -75,7 +75,7 @@ type Match_2
numbered starting at 1; group 0 refers to the entire match.
start : Integer | Text -> Integer
start self group=0 =
span = self.grapheme_span group
span = self.span group
if span.is_nothing then Nothing else span.start
## Returns the end grapheme index, plus one, of a group.
@ -90,7 +90,7 @@ type Match_2
numbered starting at 1; group 0 refers to the entire match.
end : Integer | Text -> Integer
end self group=0 =
span = self.grapheme_span group
span = self.span group
if span.is_nothing then Nothing else span.end
## Gets the UTF16 span matched by the group with the provided identifier, or
@ -120,9 +120,9 @@ type Match_2
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
Match_2.group will return the default value.
span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
span self group=0 ~default=Nothing =
Match_2.utf_16_span will return the default value.
utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
utf_16_span self group=0 ~default=Nothing =
group_id = self.pattern.lookup_group group
start = self.internal_start group_id
end = self.internal_end group_id
@ -158,10 +158,10 @@ type Match_2
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get
group 3, Match_2.group will return the default value.
grapheme_span : Integer | Text -> Any -> Span ! No_Such_Group
grapheme_span self group=0 ~default=Nothing =
result = self.span group Nothing
group 3, Match_2.span will return the default value.
span : Integer | Text -> Any -> Span ! No_Such_Group
span self group=0 ~default=Nothing =
result = self.utf_16_span group Nothing
if result.is_nothing then default else result.to_grapheme_span
## Gets the Text matched by the group with the provided identifier, or
@ -186,10 +186,10 @@ type Match_2
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get
group 3, Match_2.group will return the default value.
group 3, Match_2.text will return the default value.
text : Integer | Text -> Any -> Text ! No_Such_Group
text self group=0 ~default=Nothing =
result = self.grapheme_span group Nothing
result = self.span group Nothing
if result.is_nothing then default else result.text
## Gets a vector containing the Text of _all_ of the capturing groups in
@ -208,6 +208,16 @@ type Match_2
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. `groups` will return the
default value for groups that do not participate.
> Example
Get a vector of the text matched by all of the groups in this match,
replacing the value for groups that didn't match with "UNMATCHED".
@ -237,8 +247,8 @@ type Match_2
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
Match_2.group will return the default value.
(Pattern_2.lookup_group 3) will return 3. `named_groups` will map
a named group that does not participate to the default value.
> Example
Get the map of all of the named groups in this match, replacing the
@ -261,7 +271,7 @@ type Match_2
Arguments:
- id: The integer index or name of that group.
- if_missing: The value to return if the index is out of bounds.
get : Integer -> Any -> Any
get : Integer -> Any -> Text | Any
get self index ~if_missing=Nothing =
self.text index . catch No_Such_Group (_-> if_missing)
@ -272,6 +282,6 @@ type Match_2
Arguments:
- id: The integer index or name of that group.
- if_missing: The value to return if the index is out of bounds.
at : Integer -> Any ! Index_Out_Of_Bounds
at : Integer -> Text ! Index_Out_Of_Bounds
at self index =
self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count))

View File

@ -6,17 +6,19 @@ import project.Data.Range.Range
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Regex.Match_2.Match_2
import project.Data.Text.Regex.Replacer.Replacer
import project.Data.Text.Regex_2.No_Such_Group
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Meta
import project.Nothing.Nothing
import project.Panic.Panic
import project.Polyglot.Polyglot
from project.Data.Boolean import Boolean, True, False
polyglot java import org.enso.base.Replacer_Cache
polyglot java import org.enso.base.Text_Utils
type Pattern_2
@ -50,22 +52,24 @@ type Pattern_2
## Tries to match the provided `input` against the pattern `self`.
Returns a `Vector Match_2` objects, each containing the matched text
Returns a `Vector` of `Match_2` objects, each containing the matched text
and its match groups.
Arguments:
- input: The text to match the pattern described by `self` against.
match_all : Text -> Vector Match_2
match_all : Text -> Vector Match_2 ! Illegal_Argument
match_all self input =
builder = Vector.new_builder
it = Match_Iterator.new self input
go it = case it.next of
Match_Iterator_Value.Next _ match next_it ->
builder.append match
go next_it
Match_Iterator_Value.Last _ -> Nothing
go it
builder.to_vector
pattern_is_empty = self.internal_regex_object.pattern == ''
if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
builder = Vector.new_builder
it = Match_Iterator.new self input
go it = case it.next of
Match_Iterator_Value.Next _ match next_it ->
builder.append match
go next_it
Match_Iterator_Value.Last _ -> Nothing
go it
builder.to_vector
## Tries to match the provided `input` against the pattern `self`.
@ -89,6 +93,82 @@ type Pattern_2
find_all self input =
self.match_all input . map match_to_group_maybe
## ADVANCED
Replace all occurrences of the pattern described by `self` in the `input`
with the specified `replacement`.
Arguments:
- input: The text in which to perform the replacement(s).
- replacement: The literal text with which to replace any matches.
- only_first: If True, only replace the first match.
If this method performs no replacements it will return the `input` text
unchanged.
The replacement string can contain references to groups matched by the
regex. The following syntaxes are supported:
$0: the entire match string
$&: the entire match string
$n: the nth group
$<foo>: Named group `foo`
$$: a literal `$`
> Example
Replace letters in the text "aa".
pattern = Regex_2.compile 'aa'
pattern.replace 'aaa' 'b' == 'ba'
> Example
Replace all occurrences of letters 'l' and 'o' with '#'.
pattern = Regex_2.compile '[lo]'
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
> Example
Replace the first occurrence of letter 'l' with '#'.
pattern = Regex_2.compile 'l'
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
> Example
Replace texts in quotes with parentheses.
pattern = Regex_2.compile '"(.*?)"'
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
> Example
Replace a literal string with a replacement value.
pattern = Regex_2.compile "aa"
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "xyz"
match == "xyz ab xyz ac ad xyz xyz ax"
> Example
Replace each word with the same word surrounded by `[]`.
pattern = Regex_2.compile "([a-z]+)"
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
replace : Text -> Text -> Boolean -> Text
replace self input replacement only_first=False =
    it = Match_Iterator.new self input
    # `it` is always a `Match_Iterator`, never a `Match_Iterator_Value`, so
    # the loop below is the only path that produces the result.
    # Building the Replacer parses the replacement string; if that fails,
    # the error is returned instead of performing any replacement.
    replacer = Replacer.new replacement self
    replacer.if_not_error <|
        # `current` accumulates the output: each unmatched filler followed
        # by the replacement computed for the match that ends it.
        go next current = case next of
            Match_Iterator_Value.Next filler match next_it ->
                new_value = current + filler.text + (replacer.replace match)
                # With `only_first`, the rest of the input is taken verbatim
                # as the final filler rather than searched for more matches.
                next = if only_first then next_it.early_exit else next_it.next
                @Tail_Call go next new_value
            Match_Iterator_Value.Last filler ->
                current + filler.text
        go it.next ""
## PRIVATE
Look up a match group name or number, and check that it is valid.
@ -106,6 +186,9 @@ type Pattern_2
A group name is an alias for a group number; if a name is passed to
this method, it returns the corresponding group number.
If a group number is passed to `lookup_group` and it is valid, it will
simply return the group number.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
@ -138,6 +221,15 @@ type Pattern_2
_ : Integer -> n
Nothing -> Error.throw (No_Such_Group.Error name)
## PRIVATE
Return a lazy iterator over matches against a string.
Arguments:
- input: the string to match against.
iterator : Text -> Match_Iterator
iterator self input = Match_Iterator.new self input
## Return the number of groups in the underlying RegexObject.
Note, the count includes group 0 (the whole match) as well.
group_count : Integer
@ -154,32 +246,51 @@ type Pattern_2
Performs the regex match, and iterates through the results. Yields both
the matched parts of the string, and the 'filler' parts between them.
The 'filler' elements are `Utf_16_Span`s, not `Span`s. This is because
matches and replacement boundaries can fall in the middle of multi-
character graphemes, thereby splitting them apart.
At each step, it yields a Match_Iterator_Value, which has either a filler
and a match, or just the final filler. A Match_Iterator_Value.Last value is
returned at the end, and only at the end.
Optionally, you can call `early_exit` to have it return the remainder of
the string, unmatched, as a single Last value. (Used for `replace` with
`only_first=True`.)
type Match_Iterator
new : Pattern_2 -> Text -> Match_Iterator
new pattern input = Match_Iterator.Value pattern input 0
Value (pattern : Pattern_2) (input : Text) (cursor : Integer)
## Return the next match, or the last filler string if there is no
additional match.
Also returns the next iterator, if there was a match.
next : Match_Iterator_Value
next self =
regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
case regex_result.isMatch of
False ->
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
filler_span = (Utf_16_Span.Value filler_range self.input)
Match_Iterator_Value.Last filler_span
True ->
match_start = regex_result.getStart 0
filler_range = Range.new self.cursor match_start
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
filler_span = (Utf_16_Span.Value filler_range self.input)
match = Match_2.Value self.pattern regex_result self.input
next_cursor = match.utf16_end 0
next_cursor = match.utf_16_end 0
next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
Match_Iterator_Value.Next filler_span match next_iterator
## Returns the remainder of the string, unmatched.
early_exit : Match_Iterator_Value
early_exit self =
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
filler_span = Utf_16_Span.Value filler_range self.input
Match_Iterator_Value.Last filler_span
to_text_debug : Vector Text
to_text_debug self =
vb = Vector.new_builder

View File

@ -0,0 +1,144 @@
import project.Data.Numbers.Integer
import project.Data.Text.Extensions
import project.Data.Text.Regex.Match_2.Match_2
import project.Data.Text.Regex.Pattern_2.Match_Iterator_Value
import project.Data.Text.Regex.Pattern_2.Pattern_2
import project.Data.Text.Regex_2
import project.Data.Text.Regex_2.No_Such_Group
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Error.Error
import project.Errors.Illegal_State.Illegal_State
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
polyglot java import java.lang.StringBuilder
polyglot java import org.enso.base.Replacer_Cache
type Replacer
    ## PRIVATE
       Implements a replacement for a regular expression.

       Pattern_2.replace uses a Replacer to replace each regex match with
       a replacement string. This string can contain references to match
       groups from the original regex.

       The `new` smart constructor parses a Text into a vector of
       Replacements. Each Replacement is either a literal string or a
       group number. To provide a replacement for a regex match, the
       Replacer iterates through the Replacement vector, substitutes
       the match group contents for each group number, and concatenates
       all the strings together to form the full replacement string.
    Value (replacement : Vector Replacement)

    ## Creates a new Replacer.

       Arguments:
       - replacement_string: a string, possibly containing group references,
         that will be used to provide a replacement in a regex match.
       - pattern: the Pattern_2 whose matches will be replaced. It is used
         to resolve and validate the group references.
    new : Text -> Pattern_2 -> Replacer ! No_Such_Group
    new replacement_string pattern =
        Replacer.Value (build_replacement_vector_cached replacement_string pattern)

    ## Build a replacement string from a match.

       Arguments:
       - match: the match from the original string that is to be replaced.

       A referenced group that did not participate in the match contributes
       the empty text.
    replace : Match_2 -> Text
    replace self match =
        string_builder = StringBuilder.new
        self.replacement.each replacement->
            s = case replacement of
                Replacement.Literal text -> text
                # `Match_2.text` returns its default for a group that did not
                # participate; pass "" so that `Nothing` is never appended.
                Replacement.Substitution group_number -> match.text group_number ""
            string_builder.append s
        string_builder.toString
## PRIVATE
   Get the size of the Replacer LRU cache, as reported by
   `Replacer_Cache.getLruSize`. For testing.
get_lru_size : Integer
get_lru_size = Replacer_Cache.getLruSize
## PRIVATE
   Look up a replacement string in the Replacer LRU cache. For testing.

   Arguments:
   - replacement_string: the replacement string used as the cache key.

   Returns the replacement vector cached for that string; presumably
   `Nothing` when there is no entry.
replacer_cache_lookup : Text -> Vector Replacement | Nothing
replacer_cache_lookup replacement_string = Replacer_Cache.get replacement_string
## PRIVATE
   Regex that recognises a single group reference inside a replacement
   string: `$` followed by digits, `$`, `&`, or a name in `<>`.

   Capture groups read by `parse_group_number`:
   - 2: the digits of a numbered reference.
   - 6: the name of a named reference, without the `<>`.
group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))"
## PRIVATE
   Build a replacement vector.

   Parse the replacement string into an alternating series of literal
   strings and group reference numbers.

   Uses Replacer_Cache to avoid rebuilding the vector for recently used
   replacement strings.

   NOTE(review): the cache is keyed on `replacement_string` only, while the
   vector is built by resolving group references against `pattern`. The same
   replacement string used with a different pattern would presumably be
   served the vector built for the earlier pattern - confirm and consider
   including the pattern in the key.
build_replacement_vector_cached : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
build_replacement_vector_cached replacement_string pattern =
Replacer_Cache.get_or_set replacement_string _->
build_replacement_vector replacement_string pattern
## PRIVATE
   Build a replacement vector.

   Parse the replacement string into an alternating series of literal
   strings and group reference numbers.

   Arguments:
   - replacement_string: the replacement text, possibly containing group
     references matched by `group_reference_regex`.
   - pattern: the Pattern_2 passed to `parse_group_number` to resolve each
     group reference.

   If parsing any group reference results in an error, that error is
   returned instead of a vector.
build_replacement_vector : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
build_replacement_vector replacement_string pattern =
replacement_pattern = Regex_2.compile group_reference_regex
it = replacement_pattern.iterator replacement_string
builder = Vector.new_builder
# Walk the group references in order. For each one, append the literal text
# that precedes it and then the parsed reference.
go it = case it.next of
Match_Iterator_Value.Next filler match next_it ->
replacement = parse_group_number pattern match
# Stop at the first reference that fails to parse; nothing more is appended.
replacement.if_not_error <|
builder.append (Replacement.Literal filler.text)
builder.append replacement
@Tail_Call go next_it
Match_Iterator_Value.Last filler ->
# The text after the last reference (or the whole string if there is none).
builder.append (Replacement.Literal filler.text)
result = go it
result.if_not_error <|
builder.to_vector
## PRIVATE
   Turn one matched group reference into a `Replacement`.

   Arguments:
   - pattern: the Pattern_2 used to initiate the replacement. Its
     `lookup_group` is used to resolve named and numbered references.
   - match: the match of the replacement string against
     `group_reference_regex`.

   Returns a `Replacement.Literal "$"` for `$$`, and a
   `Replacement.Substitution` for `$&` (group 0), `$<name>` and `$n`.

   See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
parse_group_number : Pattern_2 -> Match_2 -> Replacement ! No_Such_Group
parse_group_number pattern match =
    # The first two characters of the reference are enough to tell the
    # special forms apart; anything else is a numbered reference.
    prefix = match.text.take 2
    case prefix of
        "$&" -> Replacement.Substitution 0
        "$$" -> Replacement.Literal "$"
        "$<" ->
            # Group 6 of `group_reference_regex` is the name without `<>`.
            referenced_group = pattern.lookup_group (match.text 6)
            Replacement.Substitution referenced_group
        _ ->
            # Group 2 of `group_reference_regex` is the run of digits.
            group_index = Integer.parse (match.text 2)
            Replacement.Substitution (pattern.lookup_group group_index)
## One element of a parsed replacement string: either fixed text or a
   reference to a capture group of the match being replaced.
type Replacement
    ## A string literal to replace with.
    Literal (text : Text)
    ## Target group to insert.
    Substitution (group_number : Integer)

View File

@ -7,6 +7,7 @@ import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
from project.Errors.Common import Syntax_Error
@ -17,24 +18,27 @@ polyglot java import java.util.regex.Pattern as Java_Pattern
Arguments
- expression: The text representing the regular expression that you want to
compile.
compile. Must be non-empty.
- case_insensitive: Enables or disables case-insensitive matching. Case
insensitive matching behaves as if it normalises the case of all input
text before matching on it.
If an empty regex is used, `compile` throws an Illegal_Argument error.
? Why Compile?
While many regex engines are able to cache ad-hoc patterns, it is often
useful to be able to manually retain a pattern that you have computed. This
function exists so you can hold onto the resultant `Pattern_2` object,
instead of immediately proceeding to match using it.
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error | Illegal_Argument
compile self expression case_insensitive=Nothing =
options_string = if case_insensitive == True then "usgi" else "usg"
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
options_string = if case_insensitive == True then "usgi" else "usg"
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
Pattern_2.Value internal_regex_object
Pattern_2.Value internal_regex_object
## ADVANCED

View File

@ -843,8 +843,8 @@ type Table
parse_problem_builder.attach_problems_before on_problems <|
Table.new new_columns
## Replaces the first, last, or all occurrences of `term` with
`new_text` in each text row of selected columns.
## Replaces the first or all occurrences of `term` with `new_text` in each
text row of selected columns.
If `term` is empty, the function returns the table unchanged.
This method follows the exact replacement semantics of the
@ -854,15 +854,13 @@ type Table
- columns: Column selection criteria or a column name or index.
- term: The term to find.
- new_text: The new text to replace occurrences of `term` with.
If `matcher` is a `Regex_Matcher`, `new_text` can include replacement
patterns (such as `$<n>`) for a marked group.
- mode: Specifies which occurrences of term the engine tries to find. When the
mode is `First` or `Last`, this method replaces the first or last occurrence
of term in each individual table cell. If set to `All`, it replaces all
occurrences of term.
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
regular expression and matched using the associated options.
If use_regex is true, `new_text` can include replacement patterns
(such as `$<n>`) for a marked group.
- case_sensitivity: Specifies whether `term` is matched case sensitively.
Defaults to `Case_Sensitivity.Sensitive`.
- only_first: If True, only replace the first match.
- use_regex: If true, the term is used as a regular expression.
- on_problems: Specifies how to handle if a problem occurs, raising as a
warning by default.
@ -881,21 +879,21 @@ type Table
> Example
Remove leading and trailing spaces from cells in multiple columns.
table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" matcher=Regex_Matcher.Value
table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" use_regex=True
> Example
Replace texts in quotes with parentheses in column at index 1.
table.replace_text 1 '"(.*?)"' '($1)' matcher=Regex_Matcher.Value
replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Problem_Behavior -> Table
replace_text self columns=[0] term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else
table.replace_text 1 '"(.*?)"' '($1)' use_regex=True
replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Problem_Behavior -> Table
replace_text self columns=[0] term="" new_text="" case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else
problem_builder = Problem_Builder.new
selection = self.columns_helper.select_columns_helper columns reorder=False problem_builder
selected_names = Map.from_vector (selection.map column-> [column.name, True])
map_preserve_name column f = column.map f . rename column.name
do_replace = _.replace term new_text mode matcher
do_replace = _.replace term new_text case_sensitivity=case_sensitivity only_first=only_first use_regex=use_regex
do_replace_only_text = case _ of
item : Text -> do_replace item
item -> item

View File

@ -18,7 +18,7 @@ type Naming_Helpers
sanitize_name : Text -> Text
sanitize_name name =
# Using the regex matcher due to the #5831 bug.
name.replace '\0' '\\\\0' matcher=Regex_Matcher.Value
name.replace '\0' '\\0' use_regex=True
## PRIVATE
Generates a column name for a binary operation.

View File

@ -0,0 +1,51 @@
package org.enso.base;
import org.graalvm.collections.Pair;
import org.graalvm.polyglot.Value;
import java.util.ArrayList;
import java.util.function.Function;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Small fixed-capacity cache mapping a string key to a polyglot {@link Value}.
 *
 * <p>Entries live in a ring of {@code lruSize} slots. A new entry overwrites the slot at
 * {@code nextSlot}, which then advances and wraps around. Lookups do not move entries, so the
 * entry that gets overwritten is the one that was inserted longest ago.
 */
public class Replacer_Cache {
  private static final int lruSize = 5;

  // Ring of (key, value) entries; a slot that has never been written holds null.
  private static final List<Pair<String, Value>> lru = new ArrayList<>(lruSize);

  static {
    // Pre-fill every slot so that `set` can be used on any index from the start.
    int remaining = lruSize;
    while (remaining-- > 0) {
      lru.add(null);
    }
  }

  // Position in the ring that the next insertion overwrites.
  private static int nextSlot = 0;

  /**
   * Returns the value cached under {@code key}; on a miss, calls {@code value_producer} (with a
   * null argument), stores the result under {@code key} and returns it.
   */
  public static Value get_or_set(String key, Function<Void, Value> value_producer) {
    Value cached = get(key);
    if (cached != null) {
      return cached;
    }

    Value produced = value_producer.apply(null);
    lru.set(nextSlot, Pair.create(key, produced));
    nextSlot = (nextSlot + 1) % lruSize;
    return produced;
  }

  /** Returns the value cached under {@code key}, or null if there is none. Visible for testing. */
  public static Value get(String key) {
    for (Pair<String, Value> entry : lru) {
      if (entry != null && entry.getLeft().equals(key)) {
        return entry.getRight();
      }
    }
    return null;
  }

  /** Returns the number of slots in the cache. */
  public static int getLruSize() {
    return lruSize;
  }
}

View File

@ -644,7 +644,7 @@ spec =
bools = ["bools", [False, False, True, True]]
texts = ["texts", ["foo", "bar", "baz", "spam"]]
table = Table.new [bools, texts]
actual = table.replace_text "texts" "(a|o)" "$1e" matcher=Regex_Matcher.Value
actual = table.replace_text "texts" "(a|o)" "$1e" use_regex=True
actual.at "texts" . to_vector . should_equal ["foeoe", "baer", "baez", "spaem"]
Problems.assume_no_problems actual

View File

@ -1,199 +1,98 @@
from Standard.Base import all
import Standard.Base.Data.Text.Span.Span
import Standard.Base.Data.Text.Span.Utf_16_Span
import Standard.Base.Data.Text.Regex_2
import Standard.Base.Data.Text.Regex.Match_2.Match_2
import Standard.Base.Data.Text.Regex.Pattern_2.Pattern_2
import Standard.Base.Data.Text.Regex.Replacer.Replacer
import Standard.Base.Data.Text.Regex_2
import Standard.Base.Data.Text.Regex_2.No_Such_Group
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
from Standard.Test import Test, Test_Suite
import Standard.Test.Extensions
# default_mask = Java_Pattern.CANON_EQ.bit_or Java_Pattern.UNICODE_CASE . bit_or Java_Pattern.UNICODE_CHARACTER_CLASS
polyglot java import org.enso.base.Replacer_Cache
spec =
##
Test.group "The default regex engine's options handling" <|
Test.group "Compile" <|
Test.specify "should be able to be compiled" <|
pattern = Regex_2.compile "(?<dots>..)" case_insensitive=True
pattern . should_be_a Pattern_2
Test.specify "should convert options to Java" <|
options = [Regex_Option.Comments, Regex_Option.Multiline, Default_Engine.Option.Unix_Lines]
expected_mask = Java_Pattern.UNIX_LINES.bit_or Java_Pattern.COMMENTS . bit_or Java_Pattern.MULTILINE . bit_or default_mask
actual_mask = Default_Engine.from_enso_options options
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
Regex_2.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error
actual_mask . should_equal expected_mask
Test.specify "should disallow empty patterns in `compile`" <|
Regex_2.compile "" . should_fail_with Illegal_Argument
Test.specify "should specify the unicode options by default" <|
actual_mask = Default_Engine.from_enso_options []
Test.group "Escape" <|
Test.specify "should escape an expression for use as a literal" <|
pattern = "http://example.com"
Regex_2.escape pattern . should_equal "\Qhttp://example.com\E"
actual_mask . should_equal default_mask
Test.group "Pattern.matches" <|
Test.specify "should return True when the pattern matches against the input" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
pattern.matches input . should_be_true
Test.specify "should handle ascii matching by disabling unicode" <|
actual_mask = Default_Engine.from_enso_options [Regex_Option.Ascii_Matching]
actual_mask . should_equal 0
Test.specify "should return False when the pattern doesn't match against the input" <|
pattern = Regex_2.compile "aaz"
input = "aa ab abc a bc bcd"
pattern.matches input . should_be_false
Test.specify "should result in an error when an option is invalid" <|
Default_Engine.from_enso_options [""] . should_fail_with Invalid_Option
Default_Engine.from_enso_options ["", Regex_Option.Ascii_Matching] . should_fail_with Invalid_Option
Test.specify "should check for full matches" <|
pattern = Regex_2.compile "f.o"
pattern.matches "foo" . should_be_true
pattern.matches "foobar" . should_be_false
Test.group "The default regex engine (Default_Engine)" <|
Test.specify "`matches` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.matches "ABC" . should_fail_with Illegal_Argument
Test.specify "should be able to compile patterns with no options" <|
engine = Default_Engine.new
pattern = engine.compile "^a$" []
pattern.engine . should_equal engine
pattern.options . should_equal []
pattern.internal_pattern.flags . should_equal default_mask
Test.group "Pattern.match" <|
Test.specify "should be able to `match` the first instance of the pattern in the input" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
match.text 0 . should_equal input
Test.specify "should be able to compile patterns with global options" <|
engine = Default_Engine.new
pattern = engine.compile "^a$" [Regex_Option.Multiline]
pattern.engine . should_equal engine
pattern.options . should_equal [Regex_Option.Multiline]
pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.MULTILINE)
Test.specify "should return `Nothing` if there are no matches in first mode" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "abc"
match = pattern.match input
match . should_equal Nothing
Test.specify "should be able to compile patterns with engine-specific options" <|
engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
pattern = engine.compile "^a$" []
pattern.engine . should_equal engine
pattern.options . should_equal [Default_Engine.Option.Literal_Pattern]
pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.LITERAL)
Test.specify "should be able to `match` the all instances of the pattern in the input" <|
pattern = Regex_2.compile "(..)"
input = "abcdefghij"
matches = pattern.match_all input
matches.length . should_equal 5
matches.at 0 . text 0 . should_equal "ab"
matches.at 1 . text 0 . should_equal "cd"
matches.at 2 . text 0 . should_equal "ef"
matches.at 3 . text 0 . should_equal "gh"
matches.at 4 . text 0 . should_equal "ij"
Test.specify "should be able to compile patterns with combined options" <|
engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
pattern = engine.compile "^a$" [Regex_Option.Comments]
pattern.engine . should_equal engine
pattern.options.contains Default_Engine.Option.Literal_Pattern . should_be_true
pattern.options.contains Regex_Option.Comments . should_be_true
pattern.internal_pattern.flags . should_equal (default_mask . bit_or Java_Pattern.LITERAL . bit_or Java_Pattern.COMMENTS)
Test.specify "should return `[]` when an all match match fails" <|
pattern = Regex_2.compile "(aa)"
input = "abcdefghij"
match = pattern.match_all input
match . should_equal []
Test.specify "should return a syntax error of the regex syntax is invalid" <|
engine = Default_Engine.new
engine.compile "^(a" [] . should_fail_with Syntax_Error
Test.specify "`match` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.match "ABC" . should_fail_with Illegal_Argument
Test.specify "should throw an invalid options error if an option is invalid" <|
engine = Default_Engine.new
engine.compile "^a$" ["invalid"] . should_fail_with Invalid_Option
Test.specify "`match_all` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.match_all "ABC" . should_fail_with Illegal_Argument
Test.specify "should escape an expression for use as a literal" <|
pattern = "http://example.com"
engine = Default_Engine.new
engine.escape pattern . should_equal "\Qhttp://example.com\E"
Test.group "The default regex engine's Pattern.matches" <|
engine = Default_Engine.new
Test.specify "should return True when the pattern matches against the input" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
pattern.matches input . should_be_true
Test.specify "should return False when the pattern doesn't match against the input" <|
pattern = engine.compile "aaz" []
input = "aa ab abc a bc bcd"
pattern.matches input . should_be_false
Test.specify "should check for full matches" <|
pattern = engine.compile "f.o" []
pattern.matches "foo" . should_be_true
pattern.matches "foobar" . should_be_false
Test.group "The default regex engine's Pattern.match" <|
engine = Default_Engine.new
Test.specify "should be able to `match` the first instance of the pattern in the input" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
match.text 0 . should_equal input
Test.specify "should return `Nothing` if there are no matches in first mode" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "abc"
match = pattern.match input mode=Matching_Mode.First
match . should_equal Nothing
Test.specify "should be able to `match` at most N instances of the pattern in the input" <|
pattern = engine.compile "(..)" []
input = "abcdefghij"
match = pattern.match input mode=3
match.length . should_equal 3
match.at 0 . group 0 . should_equal "ab"
match.at 1 . group 0 . should_equal "cd"
match.at 2 . group 0 . should_equal "ef"
Test.specify "should `match` fewer than N instances when there are fewer than N in the input" <|
pattern = engine.compile "(..)" []
input = "abcdef"
match = pattern.match input mode=5
match.length . should_equal 3
match.at 0 . group 0 . should_equal "ab"
match.at 1 . group 0 . should_equal "cd"
match.at 2 . group 0 . should_equal "ef"
Test.specify "should return `Nothing` when a counted match fails" <|
pattern = engine.compile "(aa)" []
input = "abcdefghij"
match = pattern.match input mode=3
match . should_equal Nothing
Test.specify "should be able to `match` the all instances of the pattern in the input" <|
pattern = engine.compile "(..)" []
input = "abcdefghij"
match = pattern.match input mode=Regex_Mode.All
match.length . should_equal 5
match.at 0 . group 0 . should_equal "ab"
match.at 1 . group 0 . should_equal "cd"
match.at 2 . group 0 . should_equal "ef"
match.at 3 . group 0 . should_equal "gh"
match.at 4 . group 0 . should_equal "ij"
Test.specify "should return `Nothing` when an all match match fails" <|
pattern = engine.compile "(aa)" []
input = "abcdefghij"
match = pattern.match input mode=Regex_Mode.All
match . should_equal Nothing
Test.specify "should be able to `match` the pattern against the entire input" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
match = pattern.match input mode=Regex_Mode.Full
match . should_be_a Default_Engine.Match.Value
match.text 0 . should_equal input
Test.specify "should return `Nothing` if a full match does not match the entire input" <|
pattern = engine.compile "(..)" []
input = "aa ab"
full_match = pattern.match input mode=Regex_Mode.Full
full_match . should_equal Nothing
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
Test.specify "should be able to `match` the pattern against bounded input" <|
pattern = engine.compile "(..)" []
input = "abcdefghij"
match = pattern.match input mode=(Regex_Mode.Bounded 2 8)
match.length . should_equal 3
match.at 0 . text 0 . should_equal "cd"
match.at 1 . text 0 . should_equal "ef"
match.at 2 . text 0 . should_equal "gh"
Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <|
pattern = engine.compile "" []
match_1 = pattern.match "" mode=Regex_Mode.All
match_1.length . should_equal 1
match_1.at 0 . start 0 . should_equal 0
match_1.at 0 . end 0 . should_equal 0
match_2 = pattern.match "ABC" mode=Regex_Mode.All
match_2.length . should_equal 4
match_2.at 0 . start 0 . should_equal 0
match_2.at 0 . end 0 . should_equal 0
match_2.at 1 . start 0 . should_equal 1
match_2.at 1 . end 0 . should_equal 1
match_2.at 3 . start 0 . should_equal 3
match_2.at 3 . end 0 . should_equal 3
Test.group "The default regex engine's Pattern.find" <|
Test.group "Pattern_2.find and .find_all" <|
Test.specify "should be able to `find` the first instance of the pattern in the input" <|
pattern = Regex_2.compile "(..)"
input = "abcdefghij"
@ -229,6 +128,14 @@ spec =
Regex_2.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
Regex_2.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"]
Test.specify "`find` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.find "ABC" . should_fail_with Illegal_Argument
Test.specify "`find_all` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.find_all "ABC" . should_fail_with Illegal_Argument
##
Test.group "The default regex engine's Pattern.split" <|
engine = Default_Engine.new
@ -279,152 +186,132 @@ spec =
match.at 3 . should_equal "e"
match.at 4 . should_equal "f"
Test.group "The default regex engine's Pattern.replace" <|
engine = Default_Engine.new
Test.group "Pattern_2.replace" <|
Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
pattern = Regex_2.compile "abc"
input = "aa ab abc a bc abc"
match = pattern.replace input "REPLACED" only_first=True
match . should_be_a Text
match . should_equal "aa ab REPLACED a bc abc"
Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
pattern = engine.compile "abc" []
input = "aa ab abc a bc abc"
match = pattern.replace input "REPLACED" mode=Matching_Mode.First
match . should_be_a Text
match . should_equal "aa ab REPLACED a bc abc"
Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <|
pattern = Regex_2.compile "xyz"
input = "aa ab ac ad"
match = pattern.replace input "REPLACED" only_first=True
match . should_equal input
Test.specify "should return the string unchanged if there are no matches to replace in first mode" <|
pattern = engine.compile "xyz" []
input = "aa ab ac ad"
match = pattern.replace input "REPLACED" mode=Matching_Mode.First
match . should_equal input
Test.specify "should be able to replace the all instances of the pattern in the input" <|
pattern = Regex_2.compile "aa"
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "REPLACED"
match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
Test.specify "should be able to replace at most N instances of the pattern in the input" <|
pattern = engine.compile "aa" []
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "REPLACED" mode=3
match . should_equal "REPLACED ab REPLACED ac ad REPLACED aa ax"
Test.specify "should return the input when an all replace fails" <|
pattern = Regex_2.compile "aa"
input = "abcdefghij"
match = pattern.replace input "REPLACED"
match . should_equal input
Test.specify "should replace fewer than N instances when there are fewer than N in the input" <|
pattern = engine.compile "aa" []
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "REPLACED" mode=10
match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
Test.specify "should return the input when a counted replace fails" <|
pattern = engine.compile "aa" []
input = "abcdefghij"
match = pattern.replace input "REPLACED" mode=3
match . should_equal input
Test.specify "should be able to replace the all instances of the pattern in the input" <|
pattern = engine.compile "aa" []
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "REPLACED" mode=Regex_Mode.All
match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
Test.specify "should return the input when an all replace fails" <|
pattern = engine.compile "aa" []
input = "abcdefghij"
match = pattern.replace input "REPLACED" mode=Regex_Mode.All
match . should_equal input
Test.specify "should be able to replace the entire input only if it matches" <|
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
match . should_equal "REPLACED"
Test.specify "should correctly replace entire input in Full mode even if partial matches are possible" <|
pattern = engine.compile "(aa)+" []
pattern.replace "aaa" "REPLACED" mode=Regex_Mode.Full . should_equal "aaa"
pattern.replace "aaaa" "REPLACED" mode=Regex_Mode.Full . should_equal "REPLACED"
Test.specify "should return the input for a full replace if the pattern doesn't match the entire input" <|
pattern = engine.compile "(..)" []
input = "aa ab"
full_match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
full_match . should_equal input
Test.specify "should not perform overlapping replacements in counted mode" <|
pattern = engine.compile "(..)" []
input = "abcdefghij"
result = pattern.replace input "REPLACED" mode=3
result . should_equal "REPLACEDREPLACEDREPLACEDghij"
Test.specify "should not perform overlapping replacements in all mode" <|
pattern = engine.compile "(..)" []
input = "aa ab"
match = pattern.replace input "REPLACED" mode=Regex_Mode.All
match . should_equal "REPLACEDREPLACEDb"
Test.specify "should handle capture groups in replacement" <|
pattern = engine.compile "(?<capture>[a-z]+)" []
pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$1]" mode=0 . should_equal "foo bar, baz"
pattern.replace "foo bar, baz" "[$1]" mode=1 . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[$1]" mode=2 . should_equal "[foo] [bar], baz"
pattern.replace "foo bar, baz" "[$1]" mode=3 . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$1]" mode=4 . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"
pattern.replace "foo bar, baz" "[${capture}]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[${capture}]" mode=0 . should_equal "foo bar, baz"
pattern.replace "foo bar, baz" "[${capture}]" mode=1 . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[${capture}]" mode=2 . should_equal "[foo] [bar], baz"
pattern.replace "foo bar, baz" "[${capture}]" mode=3 . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[${capture}]" mode=4 . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"
Test.specify "should handle capture groups in replacement in All mode" <|
pattern = engine.compile "([a-z]+)" []
pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.Full . should_equal "foo bar, baz"
pattern.replace "foo" "[$1]" mode=Regex_Mode.Full . should_equal "[foo]"
pattern_2 = engine.compile '<a href="(?<addr>.*?)">(?<name>.*?)</a>' []
pattern_2.replace '<a href="url">content</a>' "$2 <- $1" mode=Regex_Mode.Full . should_equal "content <- url"
pattern_2.replace '<a href="url">content</a>' "${name} <- ${addr}" mode=Regex_Mode.Full . should_equal "content <- url"
Test.group "Match.group" <|
engine = Default_Engine.new
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
Test.specify "should be able to replace the entire input only if it matches" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
match = pattern.replace input "REPLACED"
match . should_equal "REPLACED"
Test.specify "should return the full match with index 0" <|
match.group 0 . should_equal "aa ab abc a bc bcd"
Test.specify "should not perform overlapping replacements in all mode" <|
pattern = Regex_2.compile "(..)"
input = "aa ab"
match = pattern.replace input "REPLACED"
match . should_equal "REPLACEDREPLACEDb"
Test.specify "should return the group contents if it matches by index" <|
match.group 1 . should_equal "aa ab "
Test.specify "should handle capture groups in replacement" <|
pattern = Regex_2.compile "(?<capture>[a-z]+)"
pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$1]" only_first=True . should_equal "[foo] bar, baz"
Test.specify "should return the group contents if it matches by name" <|
match.group "letters" . should_equal "abc a bc bcd"
pattern.replace "foo bar, baz" "[$<capture>]" . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$<capture>]" only_first=True . should_equal "[foo] bar, baz"
Test.specify "should return Nothing if the group did not match" <|
match.group 3 . should_equal Nothing
pattern.replace "foo bar, baz" "[$0]" . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$0]" only_first=True . should_equal "[foo] bar, baz"
pattern.replace "foo bar, baz" "[$&]" . should_equal "[foo] [bar], [baz]"
pattern.replace "foo bar, baz" "[$&]" only_first=True . should_equal "[foo] bar, baz"
Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
match.group "fail" . should_fail_with No_Such_Group
match.group 5 . should_fail_with No_Such_Group
Test.specify "should handle unicode in capture group names" <|
pattern = Regex_2.compile "(?<건반>[a-z]+)"
pattern.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]"
Test.specify "should make named groups accessible by index" <|
match.group 2 . should_equal (match.group "letters")
# Registers the documentation examples as a test group. This must go through
# `Test.group`; `Text` is the text type and does not register test groups.
Test.group "should correctly evaluate documentation examples" <|
    Test.specify "example 1" <|
        pattern = Regex_2.compile 'aa'
        pattern.replace 'aaa' 'b' . should_equal 'ba'
Test.group "Match.groups" <|
engine = Default_Engine.new
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
Test.specify "example 2" <|
pattern = Regex_2.compile '[lo]'
pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!'
Test.specify "should return the results of all groups" <|
groups = match.groups
groups.length . should_equal 5
groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing]
Test.specify "example 3" <|
pattern = Regex_2.compile 'l'
pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!'
Test.specify "should replace unmatched groups by a user-specified value" <|
groups = match.groups "UNMATCHED"
groups.length . should_equal 5
groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"]
Test.specify "example 4" <|
pattern = Regex_2.compile '"(.*?)"'
pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz'
Test.specify "example 5" <|
pattern = Regex_2.compile "aa"
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "xyz"
match . should_equal "xyz ab xyz ac ad xyz xyz ax"
Test.specify "example 6" <|
pattern = Regex_2.compile "([a-z]+)"
pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
Test.specify "`replace` with an empty pattern should be an error" <|
pattern = Regex_2.compile ""
pattern.replace "ABC" . should_fail_with Illegal_Argument
Test.group "Match.text" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.specify "should return the full match with index 0" <|
match.text 0 . should_equal "aa ab abc a bc bcd"
Test.specify "should return the group contents if it matches by index" <|
match.text 1 . should_equal "aa ab "
Test.specify "should return the group contents if it matches by name" <|
match.text "letters" . should_equal "abc a bc bcd"
Test.specify "should return Nothing if the group did not match" <|
match.text 3 . should_equal Nothing
Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
match.text "fail" . should_fail_with No_Such_Group
match.text 5 . should_fail_with No_Such_Group
Test.specify "should make named groups accessible by index" <|
match.text 2 . should_equal (match.text "letters")
Test.group "Match.groups" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.specify "should return the results of all groups" <|
groups = match.groups
groups.length . should_equal 5
groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing]
Test.specify "should replace unmatched groups by a user-specified value" <|
groups = match.groups "UNMATCHED"
groups.length . should_equal 5
groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"]
Test.group "Match.named_groups" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
@ -445,147 +332,135 @@ spec =
groups.at "letters" . should_equal "abc a bc bcd"
groups.at "empty" . should_equal "UNMATCHED"
Test.group "Match.start" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.group "Match.start" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.specify "should return the start of a group by index" <|
match.start 1 . should_equal 0
Test.specify "should return the start of a group by index" <|
match.start 1 . should_equal 0
Test.specify "should return the start of a group by name" <|
match.start "letters" . should_equal 6
Test.specify "should return the start of a group by name" <|
match.start "letters" . should_equal 6
Test.specify "should return Nothing if the group didn't match" <|
match.start 3 . should_equal Nothing
match.start "empty" . should_equal Nothing
Test.specify "should return Nothing if the group didn't match" <|
match.start 3 . should_equal Nothing
match.start "empty" . should_equal Nothing
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.start 5 . should_fail_with No_Such_Group
match.start "nonexistent" . should_fail_with No_Such_Group
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.start 5 . should_fail_with No_Such_Group
match.start "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.end" <|
# Tests for `Match.end`: the end index of a group, looked up by number or name.
# For this input group 1 ("aa ab ") ends at 6 and "letters" ends at 18, which
# is the length of the input. Groups that did not match give Nothing; groups
# that do not exist fail with No_Such_Group.
Test.group "Match.end" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
# Guard: the fixture must be a real match before the specs below read from it.
match . should_be_a Match_2
Test.specify "should return the end of a group by index" <|
match.end 1 . should_equal 6
Test.specify "should return the end of a group by name" <|
match.end "letters" . should_equal 18
Test.specify "should return Nothing if the group didn't match" <|
match.end 3 . should_equal Nothing
match.end "empty" . should_equal Nothing
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.end 5 . should_fail_with No_Such_Group
match.end "nonexistent" . should_fail_with No_Such_Group
# Tests for `Match.utf_16_start`: the start index of a group, looked up by
# number or name. The expected values (0 and 6) are the same ones asserted for
# `Match.start` on this input.
# NOTE(review): presumably this accessor counts UTF-16 code units while `start`
# counts graphemes; this all-ASCII input cannot tell the two apart - confirm.
Test.group "Match.utf_16_start" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
# Guard: the fixture must be a real match before the specs below read from it.
match . should_be_a Match_2
Test.specify "should return the start of a group by index" <|
match.utf_16_start 1 . should_equal 0
Test.specify "should return the start of a group by name" <|
match.utf_16_start "letters" . should_equal 6
Test.specify "should return Nothing if the group didn't match" <|
match.utf_16_start 3 . should_equal Nothing
match.utf_16_start "empty" . should_equal Nothing
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.utf_16_start 5 . should_fail_with No_Such_Group
match.utf_16_start "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.utf_16_end" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.specify "should return the end of a group by index" <|
match.end 1 . should_equal 6
match.utf_16_end 1 . should_equal 6
Test.specify "should return the end of a group by name" <|
match.end "letters" . should_equal 18
match.utf_16_end "letters" . should_equal 18
Test.specify "should return Nothing if the group didn't match" <|
match.end 3 . should_equal Nothing
match.end "empty" . should_equal Nothing
match.utf_16_end 3 . should_equal Nothing
match.utf_16_end "empty" . should_equal Nothing
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.end 5 . should_fail_with No_Such_Group
match.end "nonexistent" . should_fail_with No_Such_Group
match.utf_16_end 5 . should_fail_with No_Such_Group
match.utf_16_end "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.utf16_start" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.group "Match.span" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.specify "should return the start of a group by index" <|
match.utf16_start 1 . should_equal 0
Test.specify "should get the span of a group by index" <|
match.span 1 . should_equal (Span.Value (0.up_to 6) input)
Test.specify "should return the start of a group by name" <|
match.utf16_start "letters" . should_equal 6
Test.specify "should get the span of a group by name" <|
match.span "letters" . should_equal (Span.Value (6.up_to 18) input)
Test.specify "should return Nothing if the group didn't match" <|
match.utf16_start 3 . should_equal Nothing
match.utf16_start "empty" . should_equal Nothing
Test.specify "should return Nothing if the group didn't match" <|
match.span 3 . should_equal Nothing
match.span "empty" . should_equal Nothing
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.utf16_start 5 . should_fail_with No_Such_Group
match.utf16_start "nonexistent" . should_fail_with No_Such_Group
Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
match.span 5 . should_fail_with No_Such_Group
match.span "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.utf16_end" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.group "Match.utf_16_span" <|
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match_2
Test.specify "should return the end of a group by index" <|
match.utf16_end 1 . should_equal 6
Test.specify "should get the UTF16 span of a group by index" <|
match.utf_16_span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input)
Test.specify "should return the end of a group by name" <|
match.utf16_end "letters" . should_equal 18
Test.specify "should get the UTF16 span of a group by name" <|
match.utf_16_span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input)
Test.specify "should return Nothing if the group didn't match" <|
match.utf16_end 3 . should_equal Nothing
match.utf16_end "empty" . should_equal Nothing
Test.specify "should return Nothing if the group didn't match" <|
match.utf_16_span 3 . should_equal Nothing
match.utf_16_span "empty" . should_equal Nothing
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
match.utf16_end 5 . should_fail_with No_Such_Group
match.utf16_end "nonexistent" . should_fail_with No_Such_Group
Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
match.utf_16_span 5 . should_fail_with No_Such_Group
match.utf_16_span "nonexistent" . should_fail_with No_Such_Group
##
Test.group "Match.span" <|
engine = Default_Engine.new
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
Test.group "caching" <|
Test.specify "Replacer cache drops old values" <|
pattern = Regex_2.compile('([a-c])')
Test.specify "should get the span of a group by index" <|
match.span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input)
Test.specify "should get the span of a group by name" <|
match.span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input)
Test.specify "should return Nothing if the group didn't match" <|
match.span 3 . should_equal Nothing
match.span "empty" . should_equal Nothing
Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
match.span 5 . should_fail_with No_Such_Group
match.span "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.start_position" <|
engine = Default_Engine.new
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
Test.specify "should return the region start over which self match was performed" <|
match.start_position . should_equal 0
Test.group "Match.end_position" <|
engine = Default_Engine.new
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
input = "aa ab abc a bc bcd"
match = pattern.match input mode=Matching_Mode.First
match . should_be_a Default_Engine.Match.Value
Test.specify "should return the region end over which self match was performed" <|
match.end_position . should_equal 18
Test.group "Regex options handling" <|
Test.specify "should work properly with flag options" <|
flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[]
flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
Test.specify "should properly override vector options" <|
flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[Regex_Option.Multiline, Regex_Option.Case_Insensitive]
flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Case_Insensitive, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
Test.group "Regexes" <|
Test.specify "should be able to be compiled" <|
pattern = Regex.compile "(?<dots>..)" case_insensitive=True
pattern . should_be_a Default_Engine.Pattern.Value
pattern.options . should_equal [Regex_Option.Case_Insensitive]
Test.specify "should be able to be escaped" <|
pattern = "http://example.com"
Regex.escape pattern . should_equal "\Qhttp://example.com\E"
## TODO: Missing tests for No_Such_Group_Error
# Add enough values to flush out the first values.
0.up_to get_lru_size+1 . map i->
result = pattern.replace "abcdef" ("$1$1x" + i.to_text)
result . should_not_equal Nothing
replacer_cache_lookup "$1$1x0" . should_equal Nothing
replacer_cache_lookup "$1$1x1" . should_not_equal Nothing
# Entry point: hands `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec

View File

@ -7,6 +7,7 @@ import Standard.Base.Errors.Common.Index_Out_Of_Bounds
import Standard.Base.Errors.Common.Incomparable_Values
import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.IO
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
@ -15,6 +16,7 @@ from Standard.Base.Data.Index_Sub_Range.Index_Sub_Range import all
from Standard.Test import Test, Test_Suite
import Standard.Test.Extensions
import Standard.Base.Data.Text.Extensions
type Auto
Value a
@ -1190,9 +1192,9 @@ spec =
"Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
Test.specify "find should produce correct spans" <|
"Hello World!".find ".o" Case_Sensitivity.Insensitive . grapheme_span 0 . should_equal (Span.Value (3.up_to 5) "Hello World!")
"Hello World!".find_all ".o" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
"foobar".find "BAR" Case_Sensitivity.Insensitive . grapheme_span 0 . should_equal (Span.Value (3.up_to 6) "foobar")
"Hello World!".find ".o" Case_Sensitivity.Insensitive . span 0 . should_equal (Span.Value (3.up_to 5) "Hello World!")
"Hello World!".find_all ".o" . map (match-> match.span 0) . should_equal [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
"foobar".find "BAR" Case_Sensitivity.Insensitive . span 0 . should_equal (Span.Value (3.up_to 6) "foobar")
Test.specify "should handle accents and other multi-point graphemes" <|
accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
@ -1201,29 +1203,20 @@ spec =
accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}'
# Check both UTF16 spans
accents.find_all 'h' . map (match-> match.span 0) . should_equal [Utf_16_Span.Value (6.up_to 7) accents, Utf_16_Span.Value (9.up_to 10) accents]
accents.find_all 'e\u{301}' . map (match-> match.span 0) . should_equal [Utf_16_Span.Value (2.up_to 4) accents, Utf_16_Span.Value (7.up_to 9) accents]
accents.find_all 'h' . map (match-> match.utf_16_span 0) . should_equal [Utf_16_Span.Value (6.up_to 7) accents, Utf_16_Span.Value (9.up_to 10) accents]
accents.find_all 'e\u{301}' . map (match-> match.utf_16_span 0) . should_equal [Utf_16_Span.Value (2.up_to 4) accents, Utf_16_Span.Value (7.up_to 9) accents]
# Check both grapheme spans
accents.find_all 'h' . map (match-> match.grapheme_span 0) . should_equal [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
accents.find_all 'e\u{301}' . map (match-> match.grapheme_span 0) . should_equal [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
accents.find_all 'h' . map (match-> match.span 0) . should_equal [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
accents.find_all 'e\u{301}' . map (match-> match.span 0) . should_equal [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
# Check contents to make sure the spans' ranges are ok
accents.find 'h' . text 0 . should_equal 'h'
accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}'
Test.specify "should correctly handle regex edge cases in locate" pending="Figure out how to make Regex correctly handle empty patterns." <|
regex = Regex_Matcher.Value
"".match "foo" matcher=regex . should_equal Nothing
"".match "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing
"".match_all "foo" matcher=regex . should_equal []
"".match "" matcher=regex . should_equal ""
"".match_all "" matcher=regex . should_equal [""]
"".match "" matcher=regex mode=Matching_Mode.Last . should_equal ""
abc = 'A\u{301}ßC'
abc.match "" matcher=regex . should_equal abc
abc.match_all "" matcher=regex . should_equal ["", "", "", "", ""]
abc.match "" matcher=regex mode=Matching_Mode.Last . should_equal ""
# Searching an empty text for a non-empty pattern finds nothing:
# `find` gives Nothing and `find_all` gives an empty vector.
Test.specify "should correctly handle regex edge cases in `find`" <|
"".find "foo" . should_equal Nothing
"".find_all "foo" . should_equal []
Test.specify "should handle overlapping matches as shown in the examples" <|
"aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (1.up_to 3) "aaa")
@ -1256,6 +1249,12 @@ spec =
txt.find "^m..a..z.a$" . text 0 . should_equal "maza건반zaa"
txt.find "a..z" . text 0 . should_equal "a건반z"
# An empty pattern is not a valid argument to `find`: the call fails with
# Illegal_Argument.
Test.specify "`find` with an empty pattern should be an error" <|
'b'.find '' . should_fail_with Illegal_Argument
# `find_all` rejects an empty pattern with Illegal_Argument.
Test.specify "`find_all` with an empty pattern should be an error" <|
'b'.find_all '' . should_fail_with Illegal_Argument
Test.specify "should be possible in case-insensitive mode" <|
"MY".find "my" Case_Sensitivity.Insensitive . text 0 . should_equal "MY"
@ -1281,20 +1280,20 @@ spec =
expose normalization methods to allow developers to do it
themselves.
accents = 'a\u{301}e\u{301}o\u{301}'
accents.find accent_1 . grapheme_span 0 . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
accents.find accent_1 . span 0 . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
Test.specify "can return a vector of all match groups" <|
"abc".find "ab((c)|(d))" . groups . should_equal ['abc', 'c', 'c', Nothing]
Test.specify "should default to group 0 in .span and .grapheme_span" <|
"abacadae".find "a[bc]" . span . should_equal (Utf_16_Span.Value (0.up_to 2) "abacadae")
'a\u{301}e\u{301}o\u{301}'.find 'e\u{301}' . grapheme_span . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
# With no group argument both span accessors describe group 0, the whole
# match: `utf_16_span` as a Utf_16_Span and `span` as a Span.
# The test name lists the two accessors the body actually calls; the previous
# name repeated `.span` twice.
Test.specify "should default to group 0 in .utf_16_span and .span" <|
"abacadae".find "a[bc]" . utf_16_span . should_equal (Utf_16_Span.Value (0.up_to 2) "abacadae")
'a\u{301}e\u{301}o\u{301}'.find 'e\u{301}' . span . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
"abacadae".find_all "a[bc]" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
"abacadae".find_all "a." . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
"abacadae".find_all "a.*" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 8) "abacadae"]
"abacadae".find_all "a.+?" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
"abacadae".find_all "a[bc]" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
"abacadae".find_all "a." . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
"abacadae".find_all "a.*" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 8) "abacadae"]
"abacadae".find_all "a.+?" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
Test.specify "should allow access to match groups by number" <|
"abcddd".find "ab(c(d+))" . text 0 . should_equal "abcddd"
@ -1331,8 +1330,13 @@ spec =
Test.specify "should expand a partial-grapheme match to the whole grapheme" <|
'e\u{301}'.find '\u{301}' . text 0 . should_equal 'e\u{301}'
# Case-insensitive matching with an explicitly constructed locale is rejected:
# both `find` and `find_all` fail with Illegal_Argument.
Test.specify "should not allow non-default locale" <|
locale = Locale.new "en" "GB" "UTF-8"
'a'.find 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument
'a'.find_all 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument
Test.group "Text.match" <|
Test.specify "should default to regex" <|
Test.specify "should work correctly" <|
"My Text: Goes Here".match "^My Text: (.+)$" . should_be_true
"555-801-1923".match "^\d{3}-\d{3}-\d{4}$" . should_be_true
"Hello".match "^[a-z]+$" . should_be_false
@ -1344,12 +1348,19 @@ spec =
"abcd".match "abc" . should_be_false
"x".match "[a-z]" . should_be_true
# `match` rejects an empty pattern with Illegal_Argument.
Test.specify "`match` with an empty pattern should be an error" <|
'b'.match '' . should_fail_with Illegal_Argument
Test.specify "should be possible on unicode text" <|
"Korean: 건반".match "^Korean: (.+)$" . should_be_true
Test.specify "should be possible in case-insensitive mode" <|
"MY".match "my" Case_Sensitivity.Insensitive . should_be_true
# Case-insensitive `match` with an explicitly constructed locale fails with
# Illegal_Argument.
Test.specify "should not allow non-default locale" <|
locale = Locale.new "en" "GB" "UTF-8"
'a'.match 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument
Test.group "Regex splitting" <|
Test.specify "should be possible on text" <|
splits = "abcde".split "[bd]" Regex_Matcher.Value
@ -1402,141 +1413,113 @@ spec =
Test.group "Text.replace" <|
Test.specify "should work as in examples" <|
'aaa'.replace 'aa' 'b' . should_equal 'ba'
"Hello World!".replace "[lo]" "#" matcher=Regex_Matcher.Value . should_equal "He### W#r#d!"
"Hello World!".replace "l" "#" mode=Matching_Mode.First . should_equal "He#lo World!"
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' matcher=Regex_Matcher.Value . should_equal '(abc) foo (bar) baz'
'ß'.replace 'S' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'AA'
'affib'.replace 'i' 'X' matcher=Text_Matcher.Case_Insensitive . should_equal 'aXb'
"Hello World!".replace "[lo]" "#" use_regex=True . should_equal "He### W#r#d!"
"Hello World!".replace "l" "#" only_first=True . should_equal "He#lo World!"
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' use_regex=True . should_equal '(abc) foo (bar) baz'
# Runs the same regex replacement over every element of a vector via `map`:
# each character in the class `[abc]` is replaced with "q".
Test.specify "works when mapped over a vector of inputs" <|
inputs = ["axyz", "bxyz", "xabcz", "zazaz"]
inputs.map (s-> s.replace "[abc]" "q" use_regex=True) . should_equal ["qxyz", "qxyz", "xqqqz", "zqzqz"]
Test.specify "should correctly handle empty-string edge cases" <|
[Regex_Mode.All, Matching_Mode.First, Matching_Mode.Last] . each mode->
'aaa'.replace '' 'foo' mode=mode . should_equal 'aaa'
''.replace '' '' mode=mode . should_equal ''
'a'.replace 'a' '' mode=mode . should_equal ''
''.replace 'a' 'b' mode=mode . should_equal ''
[True, False] . each only_first->
'aaa'.replace '' 'foo' only_first=only_first . should_equal 'aaa'
'a'.replace 'a' '' only_first=only_first . should_equal ''
''.replace 'a' 'b' only_first=only_first . should_equal ''
'aba' . replace 'a' '' Matching_Mode.First . should_equal 'ba'
'aba' . replace 'a' '' Matching_Mode.Last . should_equal 'ab'
'aba' . replace 'a' '' only_first=True . should_equal 'ba'
'aba' . replace 'a' '' . should_equal 'b'
'aba' . replace 'c' '' . should_equal 'aba'
Test.specify "should correctly handle first, all and last matching with overlapping occurrences" <|
"aaa aaa".replace "aa" "c" . should_equal "ca ca"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last . should_equal "aaa ac"
"aaa aaa".replace "aa" "c" only_first=True . should_equal "ca aaa"
# With `use_regex=True`, an empty pattern makes `replace` fail with
# Illegal_Argument.
Test.specify "Regex `replace` with an empty pattern should be an error" <|
'b'.replace '' 'c' use_regex=True . should_fail_with Illegal_Argument
Test.specify "should correctly handle case-insensitive matches" <|
'AaąĄ' . replace "A" "-" matcher=Text_Matcher.Case_Insensitive . should_equal '--ąĄ'
'AaąĄ' . replace "A" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal '--ąĄ'
'AaąĄ' . replace "A" "-" . should_equal '-aąĄ'
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' matcher=Text_Matcher.Case_Sensitive . should_equal 'HeLlO wOrLd'
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' matcher=Text_Matcher.Case_Insensitive . should_equal 'Hey, wOrLd'
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' case_sensitivity=Case_Sensitivity.Sensitive . should_equal 'HeLlO wOrLd'
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'Hey, wOrLd'
"Iiİı" . replace "i" "-" . should_equal "I-İı"
"Iiİı" . replace "I" "-" . should_equal "-iİı"
"Iiİı" . replace "İ" "-" . should_equal "Ii-ı"
"Iiİı" . replace "ı" "-" . should_equal "Iiİ-"
"Iiİı" . replace "i" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "--İı"
"Iiİı" . replace "I" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "--İı"
"Iiİı" . replace "İ" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "Ii-ı"
"Iiİı" . replace "ı" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "Iiİ-"
"Iiİı" . replace "i" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "--İı"
"Iiİı" . replace "I" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "--İı"
"Iiİı" . replace "İ" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "Ii-ı"
"Iiİı" . replace "ı" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "Iiİ-"
tr_insensitive = Text_Matcher.Case_Insensitive (Locale.new "tr")
"Iiİı" . replace "i" "-" matcher=tr_insensitive . should_equal "I--ı"
"Iiİı" . replace "I" "-" matcher=tr_insensitive . should_equal "-iİ-"
"Iiİı" . replace "İ" "-" matcher=tr_insensitive . should_equal "I--ı"
"Iiİı" . replace "ı" "-" matcher=tr_insensitive . should_equal "-iİ-"
# Case-insensitive replacement on characters whose case-folded form is longer
# than one letter.
Test.specify "should correctly handle Unicode" <|
# 'S' is found twice in 'ß' and each hit is replaced, giving 'AA'.
'ß'.replace 'S' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'AA'
'ß'.replace 'ß' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'A'
# The input is written with an explicit escape for U+FB03 (the "ffi" ligature).
# With the plain ASCII letters "affib", replacing 'i' could only give 'affXb',
# so the expected 'aXb' requires the single ligature character in the input.
'a\u{FB03}b'.replace 'i' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
# NOTE(review): the literals on the next line may also have held the ligature
# originally; as written the line is plain ASCII and passes as-is - confirm.
'affib'.replace 'ffi' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
Test.specify "should correctly handle Unicode edge cases" <|
'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}'
'sśs\u{301}' . replace 's' 'O' Matching_Mode.Last . should_equal 'Ośs\u{301}'
'śs\u{301}s' . replace 's' 'O' Matching_Mode.First . should_equal 'śs\u{301}O'
'śss\u{301}' . replace 's' 'O' only_first=True . should_equal 'śOs\u{301}'
'sśs\u{301}' . replace 'ś' 'O' . should_equal 'sOO'
'śss\u{301}' . replace 'ś' 'O' only_first=True . should_equal 'Oss\u{301}'
'sśs\u{301}' . replace 's\u{301}' 'O' . should_equal 'sOO'
's\u{301}śs' . replace 's\u{301}' 'O' . should_equal 'OOs'
'SŚS\u{301}' . replace 's' 'O' . should_equal 'SŚS\u{301}'
'SŚS\u{301}' . replace 's' 'O' Matching_Mode.Last . should_equal 'SŚS\u{301}'
'ŚS\u{301}S' . replace 's' 'O' Matching_Mode.First . should_equal 'ŚS\u{301}S'
'ŚS\u{301}S' . replace 's' 'O' only_first=True . should_equal 'ŚS\u{301}S'
'SŚS\u{301}' . replace 'ś' 'O' . should_equal 'SŚS\u{301}'
'SŚS\u{301}' . replace 's\u{301}' 'O' . should_equal 'SŚS\u{301}'
'SŚS\u{301}' . replace 's' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'OŚS\u{301}'
'SŚS\u{301}' . replace 's' 'O' Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'OŚS\u{301}'
'ŚS\u{301}S' . replace 's' 'O' Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'ŚS\u{301}O'
'SŚS\u{301}' . replace 's' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'OŚS\u{301}'
'ŚS\u{301}S' . replace 's' 'O' only_first=True case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'ŚS\u{301}O' # 'ŚO\u{301}O' # 'ŚOS\u{301}S'
'SŚS\u{301}' . replace 'ś' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'SOO'
'SŚS\u{301}' . replace 's\u{301}' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'SOO'
'SŚS\u{301}' . replace 'ś' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'SOO'
'SŚS\u{301}' . replace 's\u{301}' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'SOO'
'✨🚀🚧😍😃😍😎😙😉☺' . replace '🚧😍' '|-|:)' . should_equal '✨🚀|-|:)😃😍😎😙😉☺'
'Rocket Science' . replace 'Rocket' '🚀' . should_equal '🚀 Science'
"Korean: 건반".replace "건반" "keyboard" . should_equal "Korean: keyboard"
Test.specify "will approximate ligature matches" <|
# TODO do we want to improve this? highly non-trivial for very rare edge cases
## Currently we lack 'resolution' to extract a partial match from
the ligature to keep it, probably would need some special
mapping.
'ffiffi'.replace 'ff' 'aa' matcher=Text_Matcher.Case_Insensitive . should_equal 'aaaa'
'ffiffi'.replace 'ff' 'aa' mode=Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'aaffi'
'ffiffi'.replace 'ff' 'aa' mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'ffiaa'
'affiffib'.replace 'IF' 'X' matcher=Text_Matcher.Case_Insensitive . should_equal 'aXb'
'aiffiffz' . replace 'if' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'a--fz'
'AFFIB'.replace 'ffi' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'A-B'
'ß'.replace 'SS' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
'ß'.replace 'S' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'AA'
'ß'.replace 'S' 'A' mode=Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
'ß'.replace 'S' 'A' mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
'STRASSE'.replace 'ß' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'STRA-E'
# Pins a deliberate difference between the two modes on the same input.
# Plain-text replace leaves the `s` inside 's\u{301}' alone, while regex
# replace also replaces that `s` and leaves the combining accent behind.
Test.specify "regex and non-regex replace handle accented grapheme splitting differently" <|
'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}'
'sśs\u{301}' . replace 's' 'O' use_regex=True . should_equal 'OśO\u{301}'
Test.specify "should perform simple replacement in Regex mode" <|
"ababab".replace "b" "a" matcher=Regex_Matcher.Value . should_equal "aaaaaa"
"ababab".replace "b" "a" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "aaabab"
"ababab".replace "b" "a" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "ababaa"
"ababab".replace "b" "a" use_regex=True . should_equal "aaaaaa"
"ababab".replace "b" "a" only_first=True use_regex=True . should_equal "aaabab"
"aaaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "cc"
"aaaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "caa"
"aaaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "aac"
"aaaa".replace "aa" "c" use_regex=True . should_equal "cc"
"aaaa".replace "aa" "c" only_first=True use_regex=True . should_equal "caa"
"aaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "ca"
"aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "ca"
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Sensitive . should_equal "ac"
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "ca"
"aaa".replace "aa" "c" use_regex=True . should_equal "ca"
"aaa".replace "aa" "c" only_first=True use_regex=True . should_equal "ca"
"aaa aaa".replace "aa" "c" matcher=Text_Matcher.Case_Sensitive . should_equal "ca ca"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Text_Matcher.Case_Sensitive . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Sensitive . should_equal "aaa ac"
"aaa aaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "ca ca"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "aaa ca"
"aaa aaa".replace "aa" "c" case_sensitivity=Case_Sensitivity.Sensitive use_regex=True . should_equal "ca ca"
"aaa aaa".replace "aa" "c" only_first=True case_sensitivity=Case_Sensitivity.Sensitive use_regex=True . should_equal "ca aaa"
"aaa aaa".replace "aa" "c" use_regex=True . should_equal "ca ca"
"aaa aaa".replace "aa" "c" only_first=True use_regex=True . should_equal "ca aaa"
Test.specify "in Regex mode should work with Unicode" <|
"Korean: 건반".replace "건반" "keyboard" matcher=Regex_Matcher.Value . should_equal "Korean: keyboard"
'sśs\u{301}'.replace 'ś' '-' matcher=Regex_Matcher.Value . should_equal 's--'
'sśs\u{301}'.replace 's\u{301}' '-' matcher=Regex_Matcher.Value . should_equal 's--'
Test.specify "in Regex mode should support various Regex options" <|
r1 = "İiİ".replace "\w" "a" matcher=(Regex_Matcher.Value match_ascii=True)
r1 . should_equal "İaİ"
r2 = "abaBa".replace "b" "a" matcher=(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive)
r2 . should_equal "aaaaa"
r3 = 'ab\na'.replace "b." "a" matcher=(Regex_Matcher.Value dot_matches_newline=True)
r3 . should_equal "aaa"
text = """
Foo
bar
r4 = text.replace '\n' "" matcher=(Regex_Matcher.Value multiline=True)
r4 . should_equal "Foobar"
r5 = "ababd".replace "b\w # Replacing a `b` followed by any word character" "a" matcher=(Regex_Matcher.Value comments=True)
r5 . should_equal "aaa"
"Korean: 건반".replace "건반" "keyboard" use_regex=True . should_equal "Korean: keyboard"
'sśs\u{301}'.replace 'ś' '-' use_regex=True . should_equal 's-s\u{301}'
'sśs\u{301}'.replace 's\u{301}' '-' use_regex=True . should_equal 'sś-'
Test.specify "in Regex mode should allow referring to capture groups in substitutions" <|
'<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' matcher=Regex_Matcher.Value . should_equal 'content is at url'
'<a href="url">content</a>'.replace '<a href="(?<address>.*?)">(?<text>.*?)</a>' '${text} is at ${address}' matcher=Regex_Matcher.Value . should_equal 'content is at url'
'<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' use_regex=True . should_equal 'content is at url'
'<a href="url">content</a>'.replace '<a href="(?<address>.*?)">(?<text>.*?)</a>' '$<text> is at $<address>' use_regex=True . should_equal 'content is at url'
# In regex mode, a case-insensitive replace with an explicitly constructed
# locale fails with Illegal_Argument.
Test.specify "should not allow non-default locale in regex replace" <|
locale = Locale.new "en" "GB" "UTF-8"
'a'.replace 'a' 'b' case_sensitivity=(Case_Sensitivity.Insensitive locale) use_regex=True . should_fail_with Illegal_Argument
# Without `use_regex`, the same locale-specific case-insensitive replace is
# accepted and performs the replacement.
Test.specify "should allow non-default locale in text replace" <|
locale = Locale.new "en" "GB" "UTF-8"
'a'.replace 'a' 'b' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_equal 'b'
# Entry point: hands `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec