mirror of
https://github.com/enso-org/enso.git
synced 2024-12-23 13:02:07 +03:00
Implement Regular Expression replace and update Text.replace
to the new API (#5959)
Re-implement replace on top of Truffle regex.
This commit is contained in:
parent
9bec3a4e71
commit
6b9cbeacb2
@ -363,6 +363,8 @@
|
||||
- [Aligned names of columns created by column operations.][5850]
|
||||
- [Improved `cross_tab`. Renamed `fill_missing` and `is_missing` to
|
||||
`fill_nothing` and `is_nothing`. Added `fill_empty`.][5863]
|
||||
- [Removed many regex compile flags from `replace`; added `only_first`
|
||||
flag.][5959]
|
||||
|
||||
[debug-shortcuts]:
|
||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||
@ -550,6 +552,7 @@
|
||||
[5863]: https://github.com/enso-org/enso/pull/5863
|
||||
[5917]: https://github.com/enso-org/enso/pull/5917
|
||||
[5705]: https://github.com/enso-org/enso/pull/5705
|
||||
[5959]: https://github.com/enso-org/enso/pull/5959
|
||||
|
||||
#### Enso Compiler
|
||||
|
||||
|
@ -10,6 +10,7 @@ import project.Data.Range.Range
|
||||
import project.Data.Text.Case.Case
|
||||
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import project.Data.Text.Encoding.Encoding
|
||||
import project.Data.Text.Helpers
|
||||
import project.Data.Text.Location.Location
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
@ -218,6 +219,10 @@ Text.characters self =
|
||||
- case_sensitivity: Specifies if the text values should be compared case
|
||||
sensitively.
|
||||
|
||||
If an empty regex is used, `find` throws an Illegal_Argument error.
|
||||
|
||||
If a non-default locale is used, `find` throws an Illegal_Argument error.
|
||||
|
||||
> Example
|
||||
Find the first substring matching the regex.
|
||||
|
||||
@ -227,10 +232,12 @@ Text.characters self =
|
||||
example_find_insensitive =
|
||||
## This matches `aBc` @ character 11
|
||||
"aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive
|
||||
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error
|
||||
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error | Illegal_Argument
|
||||
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
Regex_2.compile pattern case_insensitive=case_insensitive . match self
|
||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern.if_not_error <| compiled_pattern.match self
|
||||
|
||||
## Finds all the matches of the regular expression `pattern` in `self`,
|
||||
returning a Vector. If not found, will be an empty Vector.
|
||||
@ -240,6 +247,10 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
- case_sensitivity: Specifies if the text values should be compared case
|
||||
sensitively.
|
||||
|
||||
If an empty regex is used, `find_all` throws an Illegal_Argument error.
|
||||
|
||||
If a non-default locale is used, `find_all` throws an Illegal_Argument error.
|
||||
|
||||
> Example
|
||||
Find the substring matching the regex.
|
||||
|
||||
@ -249,10 +260,12 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
example_find_all_insensitive =
|
||||
## This matches `aABbbbc` @ character 0 and `aBC` @ character 11
|
||||
"aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive
|
||||
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error
|
||||
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error | Illegal_Argument
|
||||
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
Regex_2.compile pattern case_insensitive=case_insensitive . match_all self
|
||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern.if_not_error <| compiled_pattern.match_all self
|
||||
|
||||
## ALIAS Check Matches
|
||||
|
||||
@ -263,6 +276,10 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
- case_sensitivity: Specifies if the text values should be compared case
|
||||
sensitively.
|
||||
|
||||
If an empty regex is used, `match` throws an Illegal_Argument error.
|
||||
|
||||
If a non-default locale is used, `match` throws an Illegal_Argument error.
|
||||
|
||||
> Example
|
||||
Checks if whole text matches a basic email regex.
|
||||
|
||||
@ -274,11 +291,12 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
regex = ".+ct@.+"
|
||||
# Evaluates to true
|
||||
"CONTACT@enso.org".match regex Case_Sensitivity.Insensitive
|
||||
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error
|
||||
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error | Illegal_Argument
|
||||
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern.matches self
|
||||
compiled_pattern.if_not_error <| compiled_pattern.matches self
|
||||
|
||||
## ALIAS Split Text
|
||||
|
||||
@ -327,21 +345,31 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
|
||||
compiled_pattern.split self mode=Regex_Mode.All
|
||||
|
||||
## ALIAS Replace Text
|
||||
Replaces the first, last, or all occurrences of term with new_text in the
|
||||
input. If `term` is empty, the function returns the input unchanged.
|
||||
Perform a text or regex replace.
|
||||
|
||||
Returns the text with all matched elements replaced by the provided
|
||||
replacement. If `term` is empty and `use_regex` is False, the function returns the input unchanged.
|
||||
|
||||
The replacement string can contain references to groups matched by the
|
||||
regex. The following syntaxes are supported:
|
||||
$0: the entire match string
|
||||
$&: the entire match string
|
||||
$n: the nth group
|
||||
$<foo>: Named group `foo`
|
||||
|
||||
Arguments:
|
||||
- term: The term to find.
|
||||
- new_text: The new text to replace occurrences of `term` with.
|
||||
If `matcher` is a `Regex_Matcher`, `new_text` can include replacement
|
||||
patterns (such as `$<n>`) for a marked group.
|
||||
- mode: Specifies which occurences of term the engine tries to find. When the
|
||||
mode is `First` or `Last`, this method replaces the first or last occurence
|
||||
of term in the input. If set to `All`, it replaces all occurences of term in
|
||||
the input.
|
||||
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
|
||||
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
|
||||
regular expression and matched using the associated options.
|
||||
- term: The string or regex to find.
|
||||
- replacement: The text to replace matches with.
|
||||
- case_sensitivity: Enables or disables case-insensitive matching. Case
|
||||
insensitive matching behaves as if it normalises the case of all input
|
||||
text before matching on it.
|
||||
- only_first: If True, only replace the first match.
|
||||
- use_regex: If true, the term is used as a regular expression.
|
||||
|
||||
If an empty regex is used, `replace` throws an Illegal_Argument error.
|
||||
|
||||
If a non-default locale is used with a regex, `replace` throws an
|
||||
Illegal_Argument error.
|
||||
|
||||
> Example
|
||||
Replace letters in the text "aaa".
|
||||
@ -351,17 +379,17 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
|
||||
> Example
|
||||
Replace all occurrences of letters 'l' and 'o' with '#'.
|
||||
|
||||
"Hello World!".replace "[lo]" "#" matcher=Regex_Matcher == "He### W#r#d!"
|
||||
"Hello World!".replace "[lo]" "#" use_regex=True == "He### W#r#d!"
|
||||
|
||||
> Example
|
||||
Replace the first occurrence of letter 'l' with '#'.
|
||||
|
||||
"Hello World!".replace "l" "#" mode=Matching_Mode.First == "He#lo World!"
|
||||
"Hello World!".replace "l" "#" only_first=True == "He#lo World!"
|
||||
|
||||
> Example
|
||||
Replace texts in quotes with parentheses.
|
||||
|
||||
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' matcher=Regex_Matcher == '(abc) foo (bar) baz'
|
||||
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' use_regex=True == '(abc) foo (bar) baz'
|
||||
|
||||
! Matching Grapheme Clusters
|
||||
In case-insensitive mode, a single character can match multiple characters,
|
||||
@ -379,61 +407,39 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
|
||||
Extended partial matches in case-insensitive mode.
|
||||
|
||||
# The ß symbol matches the letter `S` twice in case-insensitive mode, because it folds to `ss`.
|
||||
'ß'.replace 'S' 'A' matcher=(Text_Matcher Case_Insensitive) . should_equal 'AA'
|
||||
'ß'.replace 'S' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'AA'
|
||||
# The 'ffi' ligature is a single grapheme cluster, so even if just a part of it is matched, the whole grapheme is replaced.
|
||||
'affib'.replace 'i' 'X' matcher=(Text_Matcher Case_Insensitive) . should_equal 'aXb'
|
||||
|
||||
! Last Match in Regex Mode
|
||||
Regex always performs the search from the front and matching the last
|
||||
occurrence means selecting the last of the matches while still generating
|
||||
matches from the beginning. Regex does not return overlapping matches - it
|
||||
will return a match at some position and then continue the search after that
|
||||
match. This will lead to slightly different behavior for overlapping
|
||||
occurrences of a pattern in Regex mode than in exact text matching mode
|
||||
where the matches are searched for from the back.
|
||||
'affib'.replace 'i' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
|
||||
|
||||
> Example
|
||||
Comparing Matching in Last Mode in Regex and Text mode
|
||||
Regexp replace.
|
||||
|
||||
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher . should_equal "ac"
|
||||
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher . should_equal "ca"
|
||||
'<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' use_regex=True == 'content is at url'
|
||||
|
||||
"aaa aaa".replace "aa" "c" matcher=Text_Matcher . should_equal "ca ca"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Text_Matcher . should_equal "ca aaa"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher . should_equal "aaa ac"
|
||||
"aaa aaa".replace "aa" "c" matcher=Regex_Matcher . should_equal "ca ca"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher . should_equal "ca aaa"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher . should_equal "aaa ca"
|
||||
Text.replace : Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Text
|
||||
Text.replace self term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive = if term.is_empty then self else
|
||||
case matcher of
|
||||
_ : Text_Matcher ->
|
||||
Text.replace : Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Text ! Illegal_Argument
|
||||
Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False =
|
||||
case use_regex of
|
||||
False -> if term.is_empty then self else
|
||||
array_from_single_result result = case result of
|
||||
Nothing -> Array.empty
|
||||
_ -> Array.new_1 result
|
||||
spans_array = case matcher of
|
||||
Text_Matcher.Case_Sensitive -> case mode of
|
||||
Regex_Mode.All ->
|
||||
Text_Utils.span_of_all self term
|
||||
Matching_Mode.First ->
|
||||
array_from_single_result <| Text_Utils.span_of self term
|
||||
Matching_Mode.Last ->
|
||||
array_from_single_result <| Text_Utils.last_span_of self term
|
||||
_ -> Error.throw (Illegal_Argument.Error "Invalid mode.")
|
||||
Text_Matcher.Case_Insensitive locale -> case mode of
|
||||
Regex_Mode.All ->
|
||||
spans_array = case case_sensitivity of
|
||||
Case_Sensitivity.Sensitive -> case only_first of
|
||||
False -> Text_Utils.span_of_all self term
|
||||
True -> array_from_single_result <| Text_Utils.span_of self term
|
||||
Case_Sensitivity.Insensitive locale -> case only_first of
|
||||
False ->
|
||||
Text_Utils.span_of_all_case_insensitive self term locale.java_locale
|
||||
Matching_Mode.First ->
|
||||
True ->
|
||||
array_from_single_result <|
|
||||
Text_Utils.span_of_case_insensitive self term locale.java_locale False
|
||||
Matching_Mode.Last ->
|
||||
array_from_single_result <|
|
||||
Text_Utils.span_of_case_insensitive self term locale.java_locale True
|
||||
_ -> Error.throw (Illegal_Argument.Error "Invalid mode.")
|
||||
Text_Utils.replace_spans self spans_array new_text
|
||||
_ : Regex_Matcher ->
|
||||
compiled_pattern = matcher.compile term
|
||||
compiled_pattern.replace self new_text mode=mode
|
||||
Text_Utils.replace_spans self spans_array replacement
|
||||
True ->
|
||||
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
|
||||
compiled_pattern.if_not_error <|
|
||||
compiled_pattern.replace self replacement only_first
|
||||
|
||||
## ALIAS Get Words
|
||||
|
||||
@ -1115,9 +1121,9 @@ Text.trim self where=Location.Both what=_.is_whitespace =
|
||||
|
||||
term = "straße"
|
||||
text = "MONUMENTENSTRASSE 42"
|
||||
match = text . locate term matcher=(Text_Matcher Case_Insensitive)
|
||||
term.length == 6
|
||||
match.length == 7
|
||||
match = text . locate term case_sensitivity=Case_Sensitivity.Insensitive
|
||||
term.length . should_equal 6
|
||||
match.length . should_equal 7
|
||||
|
||||
! Matching Grapheme Clusters
|
||||
In case-insensitive mode, a single character can match multiple characters,
|
||||
@ -1265,11 +1271,8 @@ Text.locate_all self term="" case_sensitivity=Case_Sensitivity.Sensitive = if te
|
||||
- term: The term to find.
|
||||
- start: The index to start searching from. If the index is negative, it
|
||||
is counted from the end of the vector.
|
||||
- matcher: Specifies how the term is matched against the input:
|
||||
- If a `Text_Matcher`, the text is compared using case-sensitively rules
|
||||
specified in the matcher.
|
||||
- If a `Regex_Matcher`, the `term` is used as a regular expression and
|
||||
matched using the associated options.
|
||||
- case_sensitivity: Specifies if the text values should be compared case
|
||||
sensitively.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
@ -1301,11 +1304,8 @@ Text.index_of self term="" start=0 case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
- term: The term to find.
|
||||
- start: The index to start searching backwards from. If the index is
|
||||
negative, it is counted from the end of the vector.
|
||||
- matcher: Specifies how the term is matched against the input:
|
||||
- If a `Text_Matcher`, the text is compared using case-sensitively rules
|
||||
specified in the matcher.
|
||||
- If a `Regex_Matcher`, the `term` is used as a regular expression and
|
||||
matched using the associated options.
|
||||
- case_sensitivity: Specifies if the text values should be compared case
|
||||
sensitively.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
|
@ -0,0 +1,16 @@
|
||||
from Standard.Base import all
|
||||
|
||||
import project.Any.Any
|
||||
import project.Data.Locale.Locale
|
||||
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
|
||||
## PRIVATE
|
||||
regex_assume_default_locale : Case_Sensitivity -> Any -> Any ! Illegal_Argument
|
||||
regex_assume_default_locale case_sensitivity ~action = case case_sensitivity of
|
||||
Case_Sensitivity.Sensitive -> action
|
||||
Case_Sensitivity.Insensitive locale -> case locale == Locale.default of
|
||||
True -> action
|
||||
False ->
|
||||
msg = "Custom locales are not supported for regexes."
|
||||
Error.throw (Illegal_Argument.Error msg)
|
@ -8,8 +8,8 @@ import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Common.Index_Out_Of_Bounds
|
||||
import project.Error.Error
|
||||
import project.Errors.Common.Index_Out_Of_Bounds
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
@ -48,9 +48,9 @@ type Match_2
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
utf16_start : Integer | Text -> Integer
|
||||
utf16_start self group=0 =
|
||||
span = self.span group
|
||||
utf_16_start : Integer | Text -> Integer
|
||||
utf_16_start self group=0 =
|
||||
span = self.utf_16_span group
|
||||
if span.is_nothing then Nothing else span.start
|
||||
|
||||
## Returns the end UTF16 character index, plus one, of a group.
|
||||
@ -58,9 +58,9 @@ type Match_2
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
utf16_end : Integer | Text -> Integer
|
||||
utf16_end self group=0 =
|
||||
span = self.span group
|
||||
utf_16_end : Integer | Text -> Integer
|
||||
utf_16_end self group=0 =
|
||||
span = self.utf_16_span group
|
||||
if span.is_nothing then Nothing else span.end
|
||||
|
||||
## Returns the start grapheme index of a group.
|
||||
@ -75,7 +75,7 @@ type Match_2
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
start : Integer | Text -> Integer
|
||||
start self group=0 =
|
||||
span = self.grapheme_span group
|
||||
span = self.span group
|
||||
if span.is_nothing then Nothing else span.start
|
||||
|
||||
## Returns the end grapheme index, plus one, of a group.
|
||||
@ -90,7 +90,7 @@ type Match_2
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
end : Integer | Text -> Integer
|
||||
end self group=0 =
|
||||
span = self.grapheme_span group
|
||||
span = self.span group
|
||||
if span.is_nothing then Nothing else span.end
|
||||
|
||||
## Gets the UTF16 span matched by the group with the provided identifier, or
|
||||
@ -120,9 +120,9 @@ type Match_2
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match_2.group will return the default value.
|
||||
span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
|
||||
span self group=0 ~default=Nothing =
|
||||
Match_2.utf_16_span will return the default value.
|
||||
utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
|
||||
utf_16_span self group=0 ~default=Nothing =
|
||||
group_id = self.pattern.lookup_group group
|
||||
start = self.internal_start group_id
|
||||
end = self.internal_end group_id
|
||||
@ -158,10 +158,10 @@ type Match_2
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get
|
||||
group 3, Match_2.group will return the default value.
|
||||
grapheme_span : Integer | Text -> Any -> Span ! No_Such_Group
|
||||
grapheme_span self group=0 ~default=Nothing =
|
||||
result = self.span group Nothing
|
||||
group 3, Match_2.span will return the default value.
|
||||
span : Integer | Text -> Any -> Span ! No_Such_Group
|
||||
span self group=0 ~default=Nothing =
|
||||
result = self.utf_16_span group Nothing
|
||||
if result.is_nothing then default else result.to_grapheme_span
|
||||
|
||||
## Gets the Text matched by the group with the provided identifier, or
|
||||
@ -186,10 +186,10 @@ type Match_2
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get
|
||||
group 3, Match_2.group will return the default value.
|
||||
group 3, Match_2.text will return the default value.
|
||||
text : Integer | Text -> Any -> Text ! No_Such_Group
|
||||
text self group=0 ~default=Nothing =
|
||||
result = self.grapheme_span group Nothing
|
||||
result = self.span group Nothing
|
||||
if result.is_nothing then default else result.text
|
||||
|
||||
## Gets a vector containing the Text of _all_ of the capturing groups in
|
||||
@ -208,6 +208,16 @@ type Match_2
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. `groups` will return the
|
||||
default value for groups that do not participate.
|
||||
|
||||
> Example
|
||||
Get a vector of the text matched by all of the groups in this match,
|
||||
replacing the value for groups that didn't match with "UNMATCHED".
|
||||
@ -237,8 +247,8 @@ type Match_2
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match_2.group will return the default value.
|
||||
(Pattern_2.lookup_group 3) will return 3. `named_groups` will map
|
||||
a named group that does not participate to the default value.
|
||||
|
||||
> Example
|
||||
Get the map of all of the named groups in this match, replacing the
|
||||
@ -261,7 +271,7 @@ type Match_2
|
||||
Arguments:
|
||||
- index: The integer index or name of that group.
|
||||
- if_missing: The value to return if the index is out of bounds.
|
||||
get : Integer -> Any -> Any
|
||||
get : Integer -> Any -> Text | Any
|
||||
get self index ~if_missing=Nothing =
|
||||
self.text index . catch No_Such_Group (_-> if_missing)
|
||||
|
||||
@ -272,6 +282,6 @@ type Match_2
|
||||
Arguments:
|
||||
- index: The integer index or name of that group.
|
||||
- if_missing: The value to return if the index is out of bounds.
|
||||
at : Integer -> Any ! Index_Out_Of_Bounds
|
||||
at : Integer -> Text ! Index_Out_Of_Bounds
|
||||
at self index =
|
||||
self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count))
|
||||
|
@ -6,17 +6,19 @@ import project.Data.Range.Range
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Regex.Match_2.Match_2
|
||||
import project.Data.Text.Regex.Replacer.Replacer
|
||||
import project.Data.Text.Regex_2.No_Such_Group
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Meta
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
import project.Polyglot.Polyglot
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
|
||||
type Pattern_2
|
||||
@ -50,13 +52,15 @@ type Pattern_2
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Vector Match_2` objects, each containing the matched text
|
||||
Returns a `Vector` of `Match_2` objects, each containing the matched text
|
||||
and its match groups.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match_all : Text -> Vector Match_2
|
||||
match_all : Text -> Vector Match_2 ! Illegal_Argument
|
||||
match_all self input =
|
||||
pattern_is_empty = self.internal_regex_object.pattern == ''
|
||||
if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
|
||||
builder = Vector.new_builder
|
||||
it = Match_Iterator.new self input
|
||||
go it = case it.next of
|
||||
@ -89,6 +93,82 @@ type Pattern_2
|
||||
find_all self input =
|
||||
self.match_all input . map match_to_group_maybe
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Replace all occurrences of the pattern described by `self` in the `input`
|
||||
with the specified `replacement`.
|
||||
|
||||
Arguments:
|
||||
- input: The text in which to perform the replacement(s).
|
||||
- replacement: The text with which to replace any matches (may contain group references).
|
||||
- only_first: If True, only replace the first match.
|
||||
|
||||
If this method performs no replacements it will return the `input` text
|
||||
unchanged.
|
||||
|
||||
The replacement string can contain references to groups matched by the
|
||||
regex. The following syntaxes are supported:
|
||||
$0: the entire match string
|
||||
$&: the entire match string
|
||||
$n: the nth group
|
||||
$<foo>: Named group `foo`
|
||||
|
||||
> Example
|
||||
Replace letters in the text "aaa".
|
||||
|
||||
pattern = Regex_2.compile 'aa'
|
||||
pattern.replace 'aaa' 'b' == 'ba'
|
||||
|
||||
> Example
|
||||
Replace all occurrences of letters 'l' and 'o' with '#'.
|
||||
|
||||
pattern = Regex_2.compile '[lo]'
|
||||
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
|
||||
|
||||
> Example
|
||||
Replace the first occurrence of letter 'l' with '#'.
|
||||
|
||||
pattern = Regex_2.compile 'l'
|
||||
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
|
||||
|
||||
> Example
|
||||
Replace texts in quotes with parentheses.
|
||||
|
||||
pattern = Regex_2.compile '"(.*?)"'
|
||||
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
|
||||
|
||||
> Example
|
||||
Replace a literal string with a replacement value.
|
||||
|
||||
pattern = Regex_2.compile "aa"
|
||||
input = "aa ab aa ac ad aa aa ax"
|
||||
match = pattern.replace input "xyz"
|
||||
match == "xyz ab xyz ac ad xyz xyz ax"
|
||||
|
||||
> Example
|
||||
Replace each word with the same word surrounded by `[]`.
|
||||
|
||||
pattern = Regex_2.compile "([a-z]+)"
|
||||
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
|
||||
|
||||
replace : Text -> Text -> Boolean -> Text
|
||||
replace self input replacement only_first=False =
|
||||
it = Match_Iterator.new self input
|
||||
case it of
|
||||
Match_Iterator_Value.Last filler -> filler.text
|
||||
_ ->
|
||||
replacer = Replacer.new replacement self
|
||||
|
||||
replacer.if_not_error <|
|
||||
go next current = case next of
|
||||
Match_Iterator_Value.Next filler match next_it ->
|
||||
new_value = current + filler.text + (replacer.replace match)
|
||||
next = if only_first then next_it.early_exit else next_it.next
|
||||
@Tail_Call go next new_value
|
||||
Match_Iterator_Value.Last filler ->
|
||||
current + filler.text
|
||||
go it.next ""
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Look up a match group name or number, and check that it is valid.
|
||||
@ -106,6 +186,9 @@ type Pattern_2
|
||||
A group name is an alias for a group number; if a name is passed to
|
||||
this method, it returns the corresponding group number.
|
||||
|
||||
If a group number is passed to `lookup_group` and it is valid, it will
|
||||
simply return the group number.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
@ -138,6 +221,15 @@ type Pattern_2
|
||||
_ : Integer -> n
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Return a lazy iterator over matches against a string.
|
||||
|
||||
Arguments
|
||||
- input: the string to match against.
|
||||
iterator : Text -> Match_Iterator
|
||||
iterator self input = Match_Iterator.new self input
|
||||
|
||||
## Return the number of groups in the underlying RegexObject.
|
||||
Note, the count includes group 0 (the whole match) as well.
|
||||
group_count : Integer
|
||||
@ -154,32 +246,51 @@ type Pattern_2
|
||||
Performs the regex match, and iterates through the results. Yields both
|
||||
the matched parts of the string, and the 'filler' parts between them.
|
||||
|
||||
The 'filler' elements are `Utf_16_Span`s, not `Span`s. This is because
|
||||
matches and replacement boundaries can fall in the middle of multi-
|
||||
character graphemes, thereby splitting them apart.
|
||||
|
||||
At each step, it yields a Match_Iterator_Value, which has either a filler
|
||||
and a match, or just the final filler. A Match_Iterator_Value.Last value is
|
||||
returned at the end, and only at the end.
|
||||
|
||||
Optionally, you can call `early_exit` to have it return the remainder of
|
||||
the string, unmatched, as a single Last value. (Used for `replace` with
|
||||
`only_first=True`.)
|
||||
type Match_Iterator
|
||||
new : Pattern_2 -> Text -> Match_Iterator
|
||||
new pattern input = Match_Iterator.Value pattern input 0
|
||||
|
||||
Value (pattern : Pattern_2) (input : Text) (cursor : Integer)
|
||||
|
||||
## Return the next match, or the last filler string if there is no
|
||||
additional match.
|
||||
|
||||
Also returns the next iterator, if there was a match.
|
||||
next : Match_Iterator_Value
|
||||
next self =
|
||||
regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
|
||||
case regex_result.isMatch of
|
||||
False ->
|
||||
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input)
|
||||
Match_Iterator_Value.Last filler_span
|
||||
True ->
|
||||
match_start = regex_result.getStart 0
|
||||
filler_range = Range.new self.cursor match_start
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input)
|
||||
match = Match_2.Value self.pattern regex_result self.input
|
||||
next_cursor = match.utf16_end 0
|
||||
next_cursor = match.utf_16_end 0
|
||||
next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
|
||||
Match_Iterator_Value.Next filler_span match next_iterator
|
||||
|
||||
## Returns the remainder of the string, unmatched.
|
||||
early_exit : Match_Iterator_Value
|
||||
early_exit self =
|
||||
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
|
||||
filler_span = Utf_16_Span.Value filler_range self.input
|
||||
Match_Iterator_Value.Last filler_span
|
||||
|
||||
to_text_debug : Vector Text
|
||||
to_text_debug self =
|
||||
vb = Vector.new_builder
|
||||
|
@ -0,0 +1,144 @@
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Extensions
|
||||
import project.Data.Text.Regex.Match_2.Match_2
|
||||
import project.Data.Text.Regex.Pattern_2.Match_Iterator_Value
|
||||
import project.Data.Text.Regex.Pattern_2.Pattern_2
|
||||
import project.Data.Text.Regex_2
|
||||
import project.Data.Text.Regex_2.No_Such_Group
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_State.Illegal_State
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
|
||||
polyglot java import java.lang.StringBuilder
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
|
||||
type Replacer
    ## PRIVATE

       Implements a replacement for a regular expression.

       Pattern_2.replace uses a Replacer to turn each regex match into its
       replacement text. The replacement string may refer to match groups of
       the original regex.

       The `new` smart constructor parses a Text into a vector of
       Replacements, each of which is either a literal string or a group
       number. To produce the replacement for one match, the Replacer walks
       that vector in order, substitutes the text of the referenced group for
       every group number, and concatenates the pieces.
    Value (replacement : Vector Replacement)

    ## Creates a new Replacer.

       Arguments:
       - replacement_string: a string, possibly containing group references,
         that will be used to provide a replacement in a regex match.
       - pattern: the Pattern_2 the replacement will be used with; it is
         handed to the parser so that group references can be looked up.
    new : Text -> Pattern_2 -> Replacer ! No_Such_Group
    new replacement_string pattern =
        parsed = build_replacement_vector_cached replacement_string pattern
        Replacer.Value parsed

    ## Build a replacement string from a match.

       Arguments:
       - match: the match from the original string that is to be replaced.
    replace : Match_2 -> Text
    replace self match =
        # Text contributed by a single piece of the parsed replacement.
        render piece = case piece of
            Replacement.Literal text -> text
            Replacement.Substitution group_number -> match.text group_number
        output = StringBuilder.new
        self.replacement.each piece->
            output.append (render piece)
        output.toString
|
||||
|
||||
## PRIVATE

   Get the size of the Replacer LRU cache. For testing.

   This is the number of slots reported by `Replacer_Cache.getLruSize`.
get_lru_size : Integer
get_lru_size = Replacer_Cache.getLruSize

## PRIVATE

   Look up a replacement string in the Replacer LRU cache. For testing.

   Arguments:
   - replacement_string: the replacement string used as the cache key.

   Returns the value that `build_replacement_vector_cached` stored for that
   key (a parsed replacement vector), or `Nothing` if the key is not cached.
replacer_cache_lookup : Text -> Vector Replacement | Nothing
replacer_cache_lookup replacement_string = Replacer_Cache.get replacement_string

## PRIVATE

   Regex recognising one group reference inside a replacement string: a `$`
   followed by exactly one of
   - a run of digits (captured as group 2),
   - another `$` (group 3),
   - `&` (group 4),
   - `<name>` (group 5, with the bare name captured as group 6).
group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))"
|
||||
|
||||
## PRIVATE

   Build a replacement vector, consulting the cache first.

   Parse the replacement string into an alternating series of literal
   strings and group reference numbers.

   Uses `Replacer_Cache` to avoid rebuilding the vector for recently used
   replacement strings: the vector is only parsed when the cache has no entry
   for `replacement_string`.

   NOTE(review): the cache is keyed on `replacement_string` alone, while the
   parsed vector is resolved against `pattern`. Confirm that a vector cached
   for one pattern is valid when the same replacement string is later used
   with a different pattern.
build_replacement_vector_cached : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
build_replacement_vector_cached replacement_string pattern =
    on_miss = _-> build_replacement_vector replacement_string pattern
    Replacer_Cache.get_or_set replacement_string on_miss
|
||||
|
||||
## PRIVATE

   Build a replacement vector.

   Parse the replacement string into an alternating series of literal
   strings and group reference numbers.

   The replacement string is scanned with `group_reference_regex`. Each group
   reference found contributes the literal text preceding it followed by the
   parsed reference; the text after the last reference is appended as a final
   literal. If parsing a reference fails, the error is returned instead of a
   vector.
build_replacement_vector : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
build_replacement_vector replacement_string pattern =
    reference_pattern = Regex_2.compile group_reference_regex
    pieces = Vector.new_builder

    # Walk the iterator, appending pieces until the trailing filler is reached
    # or a reference fails to parse.
    consume iterator = case iterator.next of
        Match_Iterator_Value.Last trailing ->
            pieces.append (Replacement.Literal trailing.text)
        Match_Iterator_Value.Next leading reference rest ->
            parsed = parse_group_number pattern reference
            parsed.if_not_error <|
                pieces.append (Replacement.Literal leading.text)
                pieces.append parsed
                @Tail_Call consume rest

    outcome = consume (reference_pattern.iterator replacement_string)
    outcome.if_not_error pieces.to_vector
|
||||
|
||||
## PRIVATE

   Parse a capture group reference.

   Arguments:
   - pattern: the Pattern_2 used to initiate the replacement. This is used
     to identify and validate capture groups.
   - match: the match of the replacement string against group_reference_regex.

   Returns a Replacement: a group number, or, in the case of `$$`, a literal.

   The first two characters of the matched reference select the form:
   - `$$` yields the literal `$`;
   - `$&` yields group 0;
   - `$<` is a named reference, looked up in `pattern` by name;
   - anything else is a numeric reference, parsed from the digits and looked
     up in `pattern` by number.

   See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
parse_group_number : Pattern_2 -> Match_2 -> Replacement ! No_Such_Group
parse_group_number pattern match =
    prefix = match.text.take 2
    case prefix of
        "$&" -> Replacement.Substitution 0
        "$$" -> Replacement.Literal "$"
        "$<" ->
            # Group 6 contains the group name without the `<>`.
            resolved = pattern.lookup_group (match.text 6)
            Replacement.Substitution resolved
        _ ->
            # Group 2 contains the digits of a numeric reference.
            group_index = Integer.parse (match.text 2)
            Replacement.Substitution (pattern.lookup_group group_index)
|
||||
|
||||
type Replacement
    ## A piece of fixed text that is copied into the result as-is.
    Literal (text : Text)

    ## A reference to a match group: the text of the group with this number
       is inserted into the result.
    Substitution (group_number : Integer)
|
@ -7,6 +7,7 @@ import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Errors.Common import Syntax_Error
|
||||
|
||||
@ -17,18 +18,21 @@ polyglot java import java.util.regex.Pattern as Java_Pattern
|
||||
|
||||
Arguments
|
||||
- expression: The text representing the regular expression that you want to
|
||||
compile.
|
||||
compile. Must be non-empty.
|
||||
- case_insensitive: Enables or disables case-insensitive matching. Case
|
||||
insensitive matching behaves as if it normalises the case of all input
|
||||
text before matching on it.
|
||||
|
||||
If an empty regex is used, `compile` throws an Illegal_Argument error.
|
||||
|
||||
? Why Compile?
|
||||
While many regex engines are able to cache ad-hoc patterns, it is often
|
||||
useful to be able to manually retain a pattern that you have computed. This
|
||||
function exists so you can hold onto the resultant `Pattern_2` object,
|
||||
instead of immediately proceeding to match using it.
|
||||
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error
|
||||
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error | Illegal_Argument
|
||||
compile self expression case_insensitive=Nothing =
|
||||
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
|
||||
options_string = if case_insensitive == True then "usgi" else "usg"
|
||||
|
||||
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
|
||||
|
@ -843,8 +843,8 @@ type Table
|
||||
parse_problem_builder.attach_problems_before on_problems <|
|
||||
Table.new new_columns
|
||||
|
||||
## Replaces the first, last, or all occurrences of `term` with
|
||||
`new_text` in each text row of selected columns.
|
||||
## Replaces the first, or all occurrences of `term` with `new_text` in each
|
||||
text row of selected columns.
|
||||
If `term` is empty, the function returns the table unchanged.
|
||||
|
||||
This method follows the exact replacement semantics of the
|
||||
@ -854,15 +854,13 @@ type Table
|
||||
- columns: Column selection criteria or a column name or index.
|
||||
- term: The term to find.
|
||||
- new_text: The new text to replace occurrences of `term` with.
|
||||
If `matcher` is a `Regex_Matcher`, `new_text` can include replacement
|
||||
patterns (such as `$<n>`) for a marked group.
|
||||
- mode: Specifies which occurences of term the engine tries to find. When the
|
||||
mode is `First` or `Last`, this method replaces the first or last occurence
|
||||
of term in each individual table cell. If set to `All`, it replaces all
|
||||
occurences of term.
|
||||
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
|
||||
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
|
||||
regular expression and matched using the associated options.
|
||||
If use_regex is true, `new_text` can include replacement patterns
|
||||
(such as `$<n>`) for a marked group.
|
||||
- case_insensitive: Enables or disables case-insensitive matching. Case
|
||||
insensitive matching behaves as if it normalises the case of all input
|
||||
text before matching on it.
|
||||
- only_first: If True, only replace the first match.
|
||||
- use_regex: If true, the term is used as a regular expression.
|
||||
- on_problems: Specifies how to handle if a problem occurs, raising as a
|
||||
warning by default.
|
||||
|
||||
@ -881,21 +879,21 @@ type Table
|
||||
> Example
|
||||
Remove leading and trailing spaces from cells in multiple columns.
|
||||
|
||||
table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" matcher=Regex_Matcher.Value
|
||||
table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" use_regex=True
|
||||
|
||||
> Example
|
||||
Replace texts in quotes with parentheses in column at index 1.
|
||||
|
||||
table.replace_text 1 '"(.*?)"' '($1)' matcher=Regex_Matcher.Value
|
||||
replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Problem_Behavior -> Table
|
||||
replace_text self columns=[0] term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else
|
||||
table.replace_text 1 '"(.*?)"' '($1)' use_regex=True
|
||||
replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Problem_Behavior -> Table
|
||||
replace_text self columns=[0] term="" new_text="" case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else
|
||||
problem_builder = Problem_Builder.new
|
||||
|
||||
selection = self.columns_helper.select_columns_helper columns reorder=False problem_builder
|
||||
selected_names = Map.from_vector (selection.map column-> [column.name, True])
|
||||
|
||||
map_preserve_name column f = column.map f . rename column.name
|
||||
do_replace = _.replace term new_text mode matcher
|
||||
do_replace = _.replace term new_text case_sensitivity=case_sensitivity only_first=only_first use_regex=use_regex
|
||||
do_replace_only_text = case _ of
|
||||
item : Text -> do_replace item
|
||||
item -> item
|
||||
|
@ -18,7 +18,7 @@ type Naming_Helpers
|
||||
sanitize_name : Text -> Text
sanitize_name name =
    # Using the regex matcher due to the #5831 bug.
    # The search term is the NUL character; it is replaced with the two
    # characters backslash and `0`.
    name.replace '\0' '\\0' use_regex=True
|
||||
|
||||
## PRIVATE
|
||||
Generates a column name for a binary operation.
|
||||
|
@ -0,0 +1,51 @@
|
||||
package org.enso.base;
|
||||
|
||||
import org.graalvm.collections.Pair;
|
||||
import org.graalvm.polyglot.Value;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.function.Function;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class Replacer_Cache {
|
||||
private static final int lruSize = 5;
|
||||
|
||||
// Circular buffer containing the most recent cache keys.
|
||||
private static final List<Pair<String, Value>> lru = new ArrayList<>(lruSize);
|
||||
|
||||
static {
|
||||
for (int i = 0; i < lruSize; ++i) {
|
||||
lru.add(null);
|
||||
}
|
||||
}
|
||||
|
||||
// Index into the circular buffer.
|
||||
private static int nextSlot = 0;
|
||||
|
||||
public static Value get_or_set(String key, Function<Void, Value> value_producer) {
|
||||
Value value = get(key);
|
||||
if (value == null) {
|
||||
value = value_producer.apply(null);
|
||||
lru.set(nextSlot, Pair.create(key, value));
|
||||
nextSlot = (nextSlot + 1) % lruSize;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
// Visible for testing.
|
||||
public static Value get(String key) {
|
||||
for (int i = 0; i < lruSize; ++i) {
|
||||
Pair<String, Value> pair = lru.get(i);
|
||||
if (pair != null && pair.getLeft().equals(key)) {
|
||||
return lru.get(i).getRight();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static int getLruSize() {
|
||||
return lruSize;
|
||||
}
|
||||
}
|
@ -644,7 +644,7 @@ spec =
|
||||
bools = ["bools", [False, False, True, True]]
|
||||
texts = ["texts", ["foo", "bar", "baz", "spam"]]
|
||||
table = Table.new [bools, texts]
|
||||
actual = table.replace_text "texts" "(a|o)" "$1e" matcher=Regex_Matcher.Value
|
||||
actual = table.replace_text "texts" "(a|o)" "$1e" use_regex=True
|
||||
actual.at "texts" . to_vector . should_equal ["foeoe", "baer", "baez", "spaem"]
|
||||
Problems.assume_no_problems actual
|
||||
|
||||
|
@ -1,199 +1,98 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Text.Span.Span
|
||||
import Standard.Base.Data.Text.Span.Utf_16_Span
|
||||
import Standard.Base.Data.Text.Regex_2
|
||||
import Standard.Base.Data.Text.Regex.Match_2.Match_2
|
||||
import Standard.Base.Data.Text.Regex.Pattern_2.Pattern_2
|
||||
import Standard.Base.Data.Text.Regex.Replacer.Replacer
|
||||
import Standard.Base.Data.Text.Regex_2
|
||||
import Standard.Base.Data.Text.Regex_2.No_Such_Group
|
||||
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
|
||||
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
|
||||
|
||||
from Standard.Test import Test, Test_Suite
|
||||
import Standard.Test.Extensions
|
||||
|
||||
# default_mask = Java_Pattern.CANON_EQ.bit_or Java_Pattern.UNICODE_CASE . bit_or Java_Pattern.UNICODE_CHARACTER_CLASS
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
|
||||
spec =
|
||||
##
|
||||
Test.group "The default regex engine's options handling" <|
|
||||
Test.group "Compile" <|
|
||||
Test.specify "should be able to be compiled" <|
|
||||
pattern = Regex_2.compile "(?<dots>..)" case_insensitive=True
|
||||
pattern . should_be_a Pattern_2
|
||||
|
||||
Test.specify "should convert options to Java" <|
|
||||
options = [Regex_Option.Comments, Regex_Option.Multiline, Default_Engine.Option.Unix_Lines]
|
||||
expected_mask = Java_Pattern.UNIX_LINES.bit_or Java_Pattern.COMMENTS . bit_or Java_Pattern.MULTILINE . bit_or default_mask
|
||||
actual_mask = Default_Engine.from_enso_options options
|
||||
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
|
||||
Regex_2.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error
|
||||
|
||||
actual_mask . should_equal expected_mask
|
||||
|
||||
Test.specify "should specify the unicode options by default" <|
|
||||
actual_mask = Default_Engine.from_enso_options []
|
||||
|
||||
actual_mask . should_equal default_mask
|
||||
|
||||
Test.specify "should handle ascii matching by disabling unicode" <|
|
||||
actual_mask = Default_Engine.from_enso_options [Regex_Option.Ascii_Matching]
|
||||
actual_mask . should_equal 0
|
||||
|
||||
Test.specify "should result in an error when an option is invalid" <|
|
||||
Default_Engine.from_enso_options [""] . should_fail_with Invalid_Option
|
||||
Default_Engine.from_enso_options ["", Regex_Option.Ascii_Matching] . should_fail_with Invalid_Option
|
||||
|
||||
Test.group "The default regex engine (Default_Engine)" <|
|
||||
|
||||
Test.specify "should be able to compile patterns with no options" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "^a$" []
|
||||
pattern.engine . should_equal engine
|
||||
pattern.options . should_equal []
|
||||
pattern.internal_pattern.flags . should_equal default_mask
|
||||
|
||||
Test.specify "should be able to compile patterns with global options" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "^a$" [Regex_Option.Multiline]
|
||||
pattern.engine . should_equal engine
|
||||
pattern.options . should_equal [Regex_Option.Multiline]
|
||||
pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.MULTILINE)
|
||||
|
||||
Test.specify "should be able to compile patterns with engine-specific options" <|
|
||||
engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
|
||||
pattern = engine.compile "^a$" []
|
||||
pattern.engine . should_equal engine
|
||||
pattern.options . should_equal [Default_Engine.Option.Literal_Pattern]
|
||||
pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.LITERAL)
|
||||
|
||||
Test.specify "should be able to compile patterns with combined options" <|
|
||||
engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
|
||||
pattern = engine.compile "^a$" [Regex_Option.Comments]
|
||||
pattern.engine . should_equal engine
|
||||
pattern.options.contains Default_Engine.Option.Literal_Pattern . should_be_true
|
||||
pattern.options.contains Regex_Option.Comments . should_be_true
|
||||
pattern.internal_pattern.flags . should_equal (default_mask . bit_or Java_Pattern.LITERAL . bit_or Java_Pattern.COMMENTS)
|
||||
|
||||
Test.specify "should return a syntax error of the regex syntax is invalid" <|
|
||||
engine = Default_Engine.new
|
||||
engine.compile "^(a" [] . should_fail_with Syntax_Error
|
||||
|
||||
Test.specify "should throw an invalid options error if an option is invalid" <|
|
||||
engine = Default_Engine.new
|
||||
engine.compile "^a$" ["invalid"] . should_fail_with Invalid_Option
|
||||
Test.specify "should disallow empty patterns in `compile`" <|
|
||||
Regex_2.compile "" . should_fail_with Illegal_Argument
|
||||
|
||||
Test.group "Escape" <|
|
||||
Test.specify "should escape an expression for use as a literal" <|
|
||||
pattern = "http://example.com"
|
||||
engine = Default_Engine.new
|
||||
engine.escape pattern . should_equal "\Qhttp://example.com\E"
|
||||
|
||||
Test.group "The default regex engine's Pattern.matches" <|
|
||||
engine = Default_Engine.new
|
||||
Regex_2.escape pattern . should_equal "\Qhttp://example.com\E"
|
||||
|
||||
Test.group "Pattern.matches" <|
|
||||
Test.specify "should return True when the pattern matches against the input" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
pattern.matches input . should_be_true
|
||||
|
||||
Test.specify "should return False when the pattern doesn't match against the input" <|
|
||||
pattern = engine.compile "aaz" []
|
||||
pattern = Regex_2.compile "aaz"
|
||||
input = "aa ab abc a bc bcd"
|
||||
pattern.matches input . should_be_false
|
||||
|
||||
Test.specify "should check for full matches" <|
|
||||
pattern = engine.compile "f.o" []
|
||||
pattern = Regex_2.compile "f.o"
|
||||
pattern.matches "foo" . should_be_true
|
||||
pattern.matches "foobar" . should_be_false
|
||||
|
||||
Test.group "The default regex engine's Pattern.match" <|
|
||||
engine = Default_Engine.new
|
||||
Test.specify "`matches` with an empty pattern should be an error" <|
|
||||
pattern = Regex_2.compile ""
|
||||
pattern.matches "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
Test.group "Pattern.match" <|
|
||||
Test.specify "should be able to `match` the first instance of the pattern in the input" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
match = pattern.match input
|
||||
match . should_be_a Match_2
|
||||
match.text 0 . should_equal input
|
||||
|
||||
Test.specify "should return `Nothing` if there are no matches in first mode" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "abc"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_equal Nothing
|
||||
|
||||
Test.specify "should be able to `match` at most N instances of the pattern in the input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=3
|
||||
match.length . should_equal 3
|
||||
match.at 0 . group 0 . should_equal "ab"
|
||||
match.at 1 . group 0 . should_equal "cd"
|
||||
match.at 2 . group 0 . should_equal "ef"
|
||||
|
||||
Test.specify "should `match` fewer than N instances when there are fewer than N in the input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "abcdef"
|
||||
match = pattern.match input mode=5
|
||||
match.length . should_equal 3
|
||||
match.at 0 . group 0 . should_equal "ab"
|
||||
match.at 1 . group 0 . should_equal "cd"
|
||||
match.at 2 . group 0 . should_equal "ef"
|
||||
|
||||
Test.specify "should return `Nothing` when a counted match fails" <|
|
||||
pattern = engine.compile "(aa)" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=3
|
||||
match = pattern.match input
|
||||
match . should_equal Nothing
|
||||
|
||||
Test.specify "should be able to `match` the all instances of the pattern in the input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
pattern = Regex_2.compile "(..)"
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=Regex_Mode.All
|
||||
match.length . should_equal 5
|
||||
match.at 0 . group 0 . should_equal "ab"
|
||||
match.at 1 . group 0 . should_equal "cd"
|
||||
match.at 2 . group 0 . should_equal "ef"
|
||||
match.at 3 . group 0 . should_equal "gh"
|
||||
match.at 4 . group 0 . should_equal "ij"
|
||||
matches = pattern.match_all input
|
||||
matches.length . should_equal 5
|
||||
matches.at 0 . text 0 . should_equal "ab"
|
||||
matches.at 1 . text 0 . should_equal "cd"
|
||||
matches.at 2 . text 0 . should_equal "ef"
|
||||
matches.at 3 . text 0 . should_equal "gh"
|
||||
matches.at 4 . text 0 . should_equal "ij"
|
||||
|
||||
Test.specify "should return `Nothing` when an all match match fails" <|
|
||||
pattern = engine.compile "(aa)" []
|
||||
Test.specify "should return `[]` when an all match match fails" <|
|
||||
pattern = Regex_2.compile "(aa)"
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=Regex_Mode.All
|
||||
match . should_equal Nothing
|
||||
match = pattern.match_all input
|
||||
match . should_equal []
|
||||
|
||||
Test.specify "should be able to `match` the pattern against the entire input" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Regex_Mode.Full
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
match.text 0 . should_equal input
|
||||
Test.specify "`match` with an empty pattern should be an error" <|
|
||||
pattern = Regex_2.compile ""
|
||||
pattern.match "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
Test.specify "should return `Nothing` if a full match does not match the entire input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "aa ab"
|
||||
full_match = pattern.match input mode=Regex_Mode.Full
|
||||
full_match . should_equal Nothing
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
Test.specify "`match_all` with an empty pattern should be an error" <|
|
||||
pattern = Regex_2.compile ""
|
||||
pattern.match_all "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
Test.specify "should be able to `match` the pattern against bounded input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=(Regex_Mode.Bounded 2 8)
|
||||
match.length . should_equal 3
|
||||
match.at 0 . text 0 . should_equal "cd"
|
||||
match.at 1 . text 0 . should_equal "ef"
|
||||
match.at 2 . text 0 . should_equal "gh"
|
||||
|
||||
Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <|
|
||||
pattern = engine.compile "" []
|
||||
match_1 = pattern.match "" mode=Regex_Mode.All
|
||||
match_1.length . should_equal 1
|
||||
match_1.at 0 . start 0 . should_equal 0
|
||||
match_1.at 0 . end 0 . should_equal 0
|
||||
|
||||
match_2 = pattern.match "ABC" mode=Regex_Mode.All
|
||||
match_2.length . should_equal 4
|
||||
match_2.at 0 . start 0 . should_equal 0
|
||||
match_2.at 0 . end 0 . should_equal 0
|
||||
match_2.at 1 . start 0 . should_equal 1
|
||||
match_2.at 1 . end 0 . should_equal 1
|
||||
match_2.at 3 . start 0 . should_equal 3
|
||||
match_2.at 3 . end 0 . should_equal 3
|
||||
|
||||
Test.group "The default regex engine's Pattern.find" <|
|
||||
Test.group "Pattern_2.find and .find_all" <|
|
||||
Test.specify "should be able to `find` the first instance of the pattern in the input" <|
|
||||
pattern = Regex_2.compile "(..)"
|
||||
input = "abcdefghij"
|
||||
@ -229,6 +128,14 @@ spec =
|
||||
Regex_2.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
|
||||
Regex_2.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"]
|
||||
|
||||
Test.specify "`find` with an empty pattern should be an error" <|
|
||||
pattern = Regex_2.compile ""
|
||||
pattern.find "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
Test.specify "`find_all` with an empty pattern should be an error" <|
|
||||
pattern = Regex_2.compile ""
|
||||
pattern.find_all "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
##
|
||||
Test.group "The default regex engine's Pattern.split" <|
|
||||
engine = Default_Engine.new
|
||||
@ -279,142 +186,122 @@ spec =
|
||||
match.at 3 . should_equal "e"
|
||||
match.at 4 . should_equal "f"
|
||||
|
||||
Test.group "The default regex engine's Pattern.replace" <|
|
||||
engine = Default_Engine.new
|
||||
|
||||
Test.group "Pattern_2.replace" <|
|
||||
Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
|
||||
pattern = engine.compile "abc" []
|
||||
pattern = Regex_2.compile "abc"
|
||||
input = "aa ab abc a bc abc"
|
||||
match = pattern.replace input "REPLACED" mode=Matching_Mode.First
|
||||
match = pattern.replace input "REPLACED" only_first=True
|
||||
match . should_be_a Text
|
||||
match . should_equal "aa ab REPLACED a bc abc"
|
||||
|
||||
Test.specify "should return the string unchanged if there are no matches to replace in first mode" <|
|
||||
pattern = engine.compile "xyz" []
|
||||
Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <|
|
||||
pattern = Regex_2.compile "xyz"
|
||||
input = "aa ab ac ad"
|
||||
match = pattern.replace input "REPLACED" mode=Matching_Mode.First
|
||||
match . should_equal input
|
||||
|
||||
Test.specify "should be able to replace at most N instances of the pattern in the input" <|
|
||||
pattern = engine.compile "aa" []
|
||||
input = "aa ab aa ac ad aa aa ax"
|
||||
match = pattern.replace input "REPLACED" mode=3
|
||||
match . should_equal "REPLACED ab REPLACED ac ad REPLACED aa ax"
|
||||
|
||||
Test.specify "should replace fewer than N instances when there are fewer than N in the input" <|
|
||||
pattern = engine.compile "aa" []
|
||||
input = "aa ab aa ac ad aa aa ax"
|
||||
match = pattern.replace input "REPLACED" mode=10
|
||||
match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
|
||||
|
||||
Test.specify "should return the input when a counted replace fails" <|
|
||||
pattern = engine.compile "aa" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.replace input "REPLACED" mode=3
|
||||
match = pattern.replace input "REPLACED" only_first=True
|
||||
match . should_equal input
|
||||
|
||||
Test.specify "should be able to replace the all instances of the pattern in the input" <|
|
||||
pattern = engine.compile "aa" []
|
||||
pattern = Regex_2.compile "aa"
|
||||
input = "aa ab aa ac ad aa aa ax"
|
||||
match = pattern.replace input "REPLACED" mode=Regex_Mode.All
|
||||
match = pattern.replace input "REPLACED"
|
||||
match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
|
||||
|
||||
Test.specify "should return the input when an all replace fails" <|
|
||||
pattern = engine.compile "aa" []
|
||||
pattern = Regex_2.compile "aa"
|
||||
input = "abcdefghij"
|
||||
match = pattern.replace input "REPLACED" mode=Regex_Mode.All
|
||||
match = pattern.replace input "REPLACED"
|
||||
match . should_equal input
|
||||
|
||||
Test.specify "should be able to replace the entire input only if it matches" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
|
||||
match = pattern.replace input "REPLACED"
|
||||
match . should_equal "REPLACED"
|
||||
|
||||
Test.specify "should correctly replace entire input in Full mode even if partial matches are possible" <|
|
||||
pattern = engine.compile "(aa)+" []
|
||||
pattern.replace "aaa" "REPLACED" mode=Regex_Mode.Full . should_equal "aaa"
|
||||
pattern.replace "aaaa" "REPLACED" mode=Regex_Mode.Full . should_equal "REPLACED"
|
||||
|
||||
Test.specify "should return the input for a full replace if the pattern doesn't match the entire input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "aa ab"
|
||||
full_match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
|
||||
full_match . should_equal input
|
||||
|
||||
Test.specify "should not perform overlapping replacements in counted mode" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "abcdefghij"
|
||||
result = pattern.replace input "REPLACED" mode=3
|
||||
result . should_equal "REPLACEDREPLACEDREPLACEDghij"
|
||||
|
||||
Test.specify "should not perform overlapping replacements in all mode" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
pattern = Regex_2.compile "(..)"
|
||||
input = "aa ab"
|
||||
match = pattern.replace input "REPLACED" mode=Regex_Mode.All
|
||||
match = pattern.replace input "REPLACED"
|
||||
match . should_equal "REPLACEDREPLACEDb"
|
||||
|
||||
Test.specify "should handle capture groups in replacement" <|
|
||||
pattern = engine.compile "(?<capture>[a-z]+)" []
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=0 . should_equal "foo bar, baz"
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=1 . should_equal "[foo] bar, baz"
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=2 . should_equal "[foo] [bar], baz"
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=3 . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=4 . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"
|
||||
pattern = Regex_2.compile "(?<capture>[a-z]+)"
|
||||
pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[$1]" only_first=True . should_equal "[foo] bar, baz"
|
||||
|
||||
pattern.replace "foo bar, baz" "[${capture}]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[${capture}]" mode=0 . should_equal "foo bar, baz"
|
||||
pattern.replace "foo bar, baz" "[${capture}]" mode=1 . should_equal "[foo] bar, baz"
|
||||
pattern.replace "foo bar, baz" "[${capture}]" mode=2 . should_equal "[foo] [bar], baz"
|
||||
pattern.replace "foo bar, baz" "[${capture}]" mode=3 . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[${capture}]" mode=4 . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
|
||||
pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"
|
||||
pattern.replace "foo bar, baz" "[$<capture>]" . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[$<capture>]" only_first=True . should_equal "[foo] bar, baz"
|
||||
|
||||
Test.specify "should handle capture groups in replacement in All mode" <|
|
||||
pattern = engine.compile "([a-z]+)" []
|
||||
pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.Full . should_equal "foo bar, baz"
|
||||
pattern.replace "foo" "[$1]" mode=Regex_Mode.Full . should_equal "[foo]"
|
||||
pattern.replace "foo bar, baz" "[$0]" . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[$0]" only_first=True . should_equal "[foo] bar, baz"
|
||||
pattern.replace "foo bar, baz" "[$&]" . should_equal "[foo] [bar], [baz]"
|
||||
pattern.replace "foo bar, baz" "[$&]" only_first=True . should_equal "[foo] bar, baz"
|
||||
|
||||
pattern_2 = engine.compile '<a href="(?<addr>.*?)">(?<name>.*?)</a>' []
|
||||
pattern_2.replace '<a href="url">content</a>' "$2 <- $1" mode=Regex_Mode.Full . should_equal "content <- url"
|
||||
pattern_2.replace '<a href="url">content</a>' "${name} <- ${addr}" mode=Regex_Mode.Full . should_equal "content <- url"
|
||||
# A capture group may be given a non-ASCII name, and `$<name>` in the
# replacement text refers back to it.
Test.specify "should handle unicode in capture group names" <|
    named_group_regex = Regex_2.compile "(?<건반>[a-z]+)"
    bracketed = named_group_regex.replace "foo bar, baz" "[$<건반>]"
    bracketed . should_equal "[foo] [bar], [baz]"
|
||||
|
||||
Test.group "Match.group" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
# Runs the `replace` documentation examples and checks that each one produces
# the documented result.
# The group is registered through `Test.group`, like every other group in this
# spec (it was previously spelled `Text.group`, which is the text type rather
# than the test framework).
Test.group "should correctly evaluate documentation examples" <|
    Test.specify "example 1" <|
        # Matches do not overlap: 'aaa' contains a single 'aa' match.
        pattern = Regex_2.compile 'aa'
        pattern.replace 'aaa' 'b' . should_equal 'ba'

    Test.specify "example 2" <|
        # Every character matched by the class is replaced.
        pattern = Regex_2.compile '[lo]'
        pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!'

    Test.specify "example 3" <|
        # `only_first=True` replaces just the first match.
        pattern = Regex_2.compile 'l'
        pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!'

    Test.specify "example 4" <|
        # `$1` in the replacement stands for the first capture group.
        pattern = Regex_2.compile '"(.*?)"'
        pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz'

    Test.specify "example 5" <|
        pattern = Regex_2.compile "aa"
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "xyz"
        match . should_equal "xyz ab xyz ac ad xyz xyz ax"

    Test.specify "example 6" <|
        pattern = Regex_2.compile "([a-z]+)"
        pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
|
||||
|
||||
Test.specify "`replace` with an empty pattern should be an error" <|
|
||||
pattern = Regex_2.compile ""
|
||||
pattern.replace "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
Test.group "Match.text" <|
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
match = pattern.match input
|
||||
match . should_be_a Match_2
|
||||
|
||||
Test.specify "should return the full match with index 0" <|
|
||||
match.group 0 . should_equal "aa ab abc a bc bcd"
|
||||
match.text 0 . should_equal "aa ab abc a bc bcd"
|
||||
|
||||
Test.specify "should return the group contents if it matches by index" <|
|
||||
match.group 1 . should_equal "aa ab "
|
||||
match.text 1 . should_equal "aa ab "
|
||||
|
||||
Test.specify "should return the group contents if it matches by name" <|
|
||||
match.group "letters" . should_equal "abc a bc bcd"
|
||||
match.text "letters" . should_equal "abc a bc bcd"
|
||||
|
||||
Test.specify "should return Nothing if the group did not match" <|
|
||||
match.group 3 . should_equal Nothing
|
||||
match.text 3 . should_equal Nothing
|
||||
|
||||
Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
|
||||
match.group "fail" . should_fail_with No_Such_Group
|
||||
match.group 5 . should_fail_with No_Such_Group
|
||||
match.text "fail" . should_fail_with No_Such_Group
|
||||
match.text 5 . should_fail_with No_Such_Group
|
||||
|
||||
Test.specify "should make named groups accessible by index" <|
|
||||
match.group 2 . should_equal (match.group "letters")
|
||||
match.text 2 . should_equal (match.text "letters")
|
||||
|
||||
Test.group "Match.groups" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
match = pattern.match input
|
||||
match . should_be_a Match_2
|
||||
|
||||
Test.specify "should return the results of all groups" <|
|
||||
groups = match.groups
|
||||
@ -485,59 +372,57 @@ spec =
|
||||
match.end 5 . should_fail_with No_Such_Group
|
||||
match.end "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
Test.group "Match.utf16_start" <|
|
||||
Test.group "Match.utf_16_start" <|
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input
|
||||
match . should_be_a Match_2
|
||||
|
||||
Test.specify "should return the start of a group by index" <|
|
||||
match.utf16_start 1 . should_equal 0
|
||||
match.utf_16_start 1 . should_equal 0
|
||||
|
||||
Test.specify "should return the start of a group by name" <|
|
||||
match.utf16_start "letters" . should_equal 6
|
||||
match.utf_16_start "letters" . should_equal 6
|
||||
|
||||
Test.specify "should return Nothing if the group didn't match" <|
|
||||
match.utf16_start 3 . should_equal Nothing
|
||||
match.utf16_start "empty" . should_equal Nothing
|
||||
match.utf_16_start 3 . should_equal Nothing
|
||||
match.utf_16_start "empty" . should_equal Nothing
|
||||
|
||||
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
|
||||
match.utf16_start 5 . should_fail_with No_Such_Group
|
||||
match.utf16_start "nonexistent" . should_fail_with No_Such_Group
|
||||
match.utf_16_start 5 . should_fail_with No_Such_Group
|
||||
match.utf_16_start "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
Test.group "Match.utf16_end" <|
|
||||
Test.group "Match.utf_16_end" <|
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input
|
||||
match . should_be_a Match_2
|
||||
|
||||
Test.specify "should return the end of a group by index" <|
|
||||
match.utf16_end 1 . should_equal 6
|
||||
match.utf_16_end 1 . should_equal 6
|
||||
|
||||
Test.specify "should return the end of a group by name" <|
|
||||
match.utf16_end "letters" . should_equal 18
|
||||
match.utf_16_end "letters" . should_equal 18
|
||||
|
||||
Test.specify "should return Nothing if the group didn't match" <|
|
||||
match.utf16_end 3 . should_equal Nothing
|
||||
match.utf16_end "empty" . should_equal Nothing
|
||||
match.utf_16_end 3 . should_equal Nothing
|
||||
match.utf_16_end "empty" . should_equal Nothing
|
||||
|
||||
Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
|
||||
match.utf16_end 5 . should_fail_with No_Such_Group
|
||||
match.utf16_end "nonexistent" . should_fail_with No_Such_Group
|
||||
match.utf_16_end 5 . should_fail_with No_Such_Group
|
||||
match.utf_16_end "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
##
|
||||
Test.group "Match.span" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
match = pattern.match input
|
||||
match . should_be_a Match_2
|
||||
|
||||
Test.specify "should get the span of a group by index" <|
|
||||
match.span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input)
|
||||
match.span 1 . should_equal (Span.Value (0.up_to 6) input)
|
||||
|
||||
Test.specify "should get the span of a group by name" <|
|
||||
match.span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input)
|
||||
match.span "letters" . should_equal (Span.Value (6.up_to 18) input)
|
||||
|
||||
Test.specify "should return Nothing if the group didn't match" <|
|
||||
match.span 3 . should_equal Nothing
|
||||
@ -547,45 +432,35 @@ spec =
|
||||
match.span 5 . should_fail_with No_Such_Group
|
||||
match.span "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
Test.group "Match.start_position" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
Test.group "Match.utf_16_span" <|
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
match = pattern.match input
|
||||
match . should_be_a Match_2
|
||||
|
||||
Test.specify "should return the region start over which self match was performed" <|
|
||||
match.start_position . should_equal 0
|
||||
Test.specify "should get the UTF16 span of a group by index" <|
|
||||
match.utf_16_span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input)
|
||||
|
||||
Test.group "Match.end_position" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
Test.specify "should get the UTF16 span of a group by name" <|
|
||||
match.utf_16_span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input)
|
||||
|
||||
Test.specify "should return the region end over which self match was performed" <|
|
||||
match.end_position . should_equal 18
|
||||
Test.specify "should return Nothing if the group didn't match" <|
|
||||
match.utf_16_span 3 . should_equal Nothing
|
||||
match.utf_16_span "empty" . should_equal Nothing
|
||||
|
||||
Test.group "Regex options handling" <|
|
||||
Test.specify "should work properly with flag options" <|
|
||||
flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[]
|
||||
flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
|
||||
Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
|
||||
match.utf_16_span 5 . should_fail_with No_Such_Group
|
||||
match.utf_16_span "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
Test.specify "should properly override vector options" <|
|
||||
flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[Regex_Option.Multiline, Regex_Option.Case_Insensitive]
|
||||
flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Case_Insensitive, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
|
||||
Test.group "caching" <|
|
||||
Test.specify "Replacer cache drops old values" <|
|
||||
pattern = Regex_2.compile('([a-c])')
|
||||
|
||||
Test.group "Regexes" <|
|
||||
Test.specify "should be able to be compiled" <|
|
||||
pattern = Regex.compile "(?<dots>..)" case_insensitive=True
|
||||
pattern . should_be_a Default_Engine.Pattern.Value
|
||||
pattern.options . should_equal [Regex_Option.Case_Insensitive]
|
||||
|
||||
Test.specify "should be able to be escaped" <|
|
||||
pattern = "http://example.com"
|
||||
Regex.escape pattern . should_equal "\Qhttp://example.com\E"
|
||||
|
||||
## TODO: Missing tests for No_Such_Group_Error
|
||||
# Add enough values to flush out the first values.
|
||||
0.up_to get_lru_size+1 . map i->
|
||||
result = pattern.replace "abcdef" ("$1$1x" + i.to_text)
|
||||
result . should_not_equal Nothing
|
||||
replacer_cache_lookup "$1$1x0" . should_equal Nothing
|
||||
replacer_cache_lookup "$1$1x1" . should_not_equal Nothing
|
||||
|
||||
main = Test_Suite.run_main spec
|
||||
|
@ -7,6 +7,7 @@ import Standard.Base.Errors.Common.Index_Out_Of_Bounds
|
||||
import Standard.Base.Errors.Common.Incomparable_Values
|
||||
import Standard.Base.Errors.Common.Type_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
import Standard.Base.IO
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
|
||||
@ -15,6 +16,7 @@ from Standard.Base.Data.Index_Sub_Range.Index_Sub_Range import all
|
||||
|
||||
from Standard.Test import Test, Test_Suite
|
||||
import Standard.Test.Extensions
|
||||
import Standard.Base.Data.Text.Extensions
|
||||
|
||||
type Auto
|
||||
Value a
|
||||
@ -1190,9 +1192,9 @@ spec =
|
||||
"Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
|
||||
|
||||
Test.specify "find should produce correct spans" <|
|
||||
"Hello World!".find ".o" Case_Sensitivity.Insensitive . grapheme_span 0 . should_equal (Span.Value (3.up_to 5) "Hello World!")
|
||||
"Hello World!".find_all ".o" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
|
||||
"foobar".find "BAR" Case_Sensitivity.Insensitive . grapheme_span 0 . should_equal (Span.Value (3.up_to 6) "foobar")
|
||||
"Hello World!".find ".o" Case_Sensitivity.Insensitive . span 0 . should_equal (Span.Value (3.up_to 5) "Hello World!")
|
||||
"Hello World!".find_all ".o" . map (match-> match.span 0) . should_equal [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
|
||||
"foobar".find "BAR" Case_Sensitivity.Insensitive . span 0 . should_equal (Span.Value (3.up_to 6) "foobar")
|
||||
|
||||
Test.specify "should handle accents and other multi-point graphemes" <|
|
||||
accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
|
||||
@ -1201,29 +1203,20 @@ spec =
|
||||
accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}'
|
||||
|
||||
# Check both UTF16 spans
|
||||
accents.find_all 'h' . map (match-> match.span 0) . should_equal [Utf_16_Span.Value (6.up_to 7) accents, Utf_16_Span.Value (9.up_to 10) accents]
|
||||
accents.find_all 'e\u{301}' . map (match-> match.span 0) . should_equal [Utf_16_Span.Value (2.up_to 4) accents, Utf_16_Span.Value (7.up_to 9) accents]
|
||||
accents.find_all 'h' . map (match-> match.utf_16_span 0) . should_equal [Utf_16_Span.Value (6.up_to 7) accents, Utf_16_Span.Value (9.up_to 10) accents]
|
||||
accents.find_all 'e\u{301}' . map (match-> match.utf_16_span 0) . should_equal [Utf_16_Span.Value (2.up_to 4) accents, Utf_16_Span.Value (7.up_to 9) accents]
|
||||
|
||||
# Check both grapheme spans
|
||||
accents.find_all 'h' . map (match-> match.grapheme_span 0) . should_equal [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
|
||||
accents.find_all 'e\u{301}' . map (match-> match.grapheme_span 0) . should_equal [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
|
||||
accents.find_all 'h' . map (match-> match.span 0) . should_equal [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
|
||||
accents.find_all 'e\u{301}' . map (match-> match.span 0) . should_equal [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
|
||||
|
||||
# Check contents to make sure the spans' ranges are ok
|
||||
accents.find 'h' . text 0 . should_equal 'h'
|
||||
accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}'
|
||||
|
||||
Test.specify "should correctly handle regex edge cases in locate" pending="Figure out how to make Regex correctly handle empty patterns." <|
|
||||
regex = Regex_Matcher.Value
|
||||
"".match "foo" matcher=regex . should_equal Nothing
|
||||
"".match "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing
|
||||
"".match_all "foo" matcher=regex . should_equal []
|
||||
"".match "" matcher=regex . should_equal ""
|
||||
"".match_all "" matcher=regex . should_equal [""]
|
||||
"".match "" matcher=regex mode=Matching_Mode.Last . should_equal ""
|
||||
abc = 'A\u{301}ßC'
|
||||
abc.match "" matcher=regex . should_equal abc
|
||||
abc.match_all "" matcher=regex . should_equal ["", "", "", "", ""]
|
||||
abc.match "" matcher=regex mode=Matching_Mode.Last . should_equal ""
|
||||
# Searching an empty text for a non-empty pattern finds nothing: `find` gives
# `Nothing` and `find_all` gives an empty vector.
Test.specify "should correctly handle regex edge cases in `find`" <|
    empty_text = ""
    empty_text.find "foo" . should_equal Nothing
    empty_text.find_all "foo" . should_equal []
|
||||
|
||||
Test.specify "should handle overlapping matches as shown in the examples" <|
|
||||
"aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (1.up_to 3) "aaa")
|
||||
@ -1256,6 +1249,12 @@ spec =
|
||||
txt.find "^m..a..z.a$" . text 0 . should_equal "maza건반zaa"
|
||||
txt.find "a..z" . text 0 . should_equal "a건반z"
|
||||
|
||||
# An empty regex is rejected by `find` with an `Illegal_Argument` error.
Test.specify "`find` with an empty pattern should be an error" <|
    outcome = 'b'.find ''
    outcome . should_fail_with Illegal_Argument

# The same holds for `find_all`.
Test.specify "`find_all` with an empty pattern should be an error" <|
    outcome = 'b'.find_all ''
    outcome . should_fail_with Illegal_Argument
|
||||
|
||||
Test.specify "should be possible in case-insensitive mode" <|
|
||||
"MY".find "my" Case_Sensitivity.Insensitive . text 0 . should_equal "MY"
|
||||
|
||||
@ -1281,20 +1280,20 @@ spec =
|
||||
expose normalization methods to allow developers to do it
|
||||
themselves.
|
||||
accents = 'a\u{301}e\u{301}o\u{301}'
|
||||
accents.find accent_1 . grapheme_span 0 . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
|
||||
accents.find accent_1 . span 0 . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
|
||||
|
||||
Test.specify "can return a vector of all match groups" <|
|
||||
"abc".find "ab((c)|(d))" . groups . should_equal ['abc', 'c', 'c', Nothing]
|
||||
|
||||
Test.specify "should default to group 0 in .span and .grapheme_span" <|
|
||||
"abacadae".find "a[bc]" . span . should_equal (Utf_16_Span.Value (0.up_to 2) "abacadae")
|
||||
'a\u{301}e\u{301}o\u{301}'.find 'e\u{301}' . grapheme_span . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
|
||||
# When no group is given, both span accessors describe the whole match
# (group 0). The test name lists the two accessors that are exercised.
Test.specify "should default to group 0 in .utf_16_span and .span" <|
    # UTF-16 code-unit based span.
    "abacadae".find "a[bc]" . utf_16_span . should_equal (Utf_16_Span.Value (0.up_to 2) "abacadae")
    # Grapheme based span: 'e\u{301}' is the second grapheme of the input.
    'a\u{301}e\u{301}o\u{301}'.find 'e\u{301}' . span . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
|
||||
|
||||
Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
|
||||
"abacadae".find_all "a[bc]" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
|
||||
"abacadae".find_all "a." . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
|
||||
"abacadae".find_all "a.*" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 8) "abacadae"]
|
||||
"abacadae".find_all "a.+?" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
|
||||
"abacadae".find_all "a[bc]" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
|
||||
"abacadae".find_all "a." . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
|
||||
"abacadae".find_all "a.*" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 8) "abacadae"]
|
||||
"abacadae".find_all "a.+?" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
|
||||
|
||||
Test.specify "should allow access to match groups by number" <|
|
||||
"abcddd".find "ab(c(d+))" . text 0 . should_equal "abcddd"
|
||||
@ -1331,8 +1330,13 @@ spec =
|
||||
Test.specify "should expand a partial-grapheme match to the whole grapheme" <|
|
||||
'e\u{301}'.find '\u{301}' . text 0 . should_equal 'e\u{301}'
|
||||
|
||||
# `find` and `find_all` reject a case-insensitive setting that carries an
# explicit locale, reporting `Illegal_Argument`.
Test.specify "should not allow non-default locale" <|
    british = Locale.new "en" "GB" "UTF-8"
    insensitive_with_locale = Case_Sensitivity.Insensitive british
    'a'.find 'a' case_sensitivity=insensitive_with_locale . should_fail_with Illegal_Argument
    'a'.find_all 'a' case_sensitivity=insensitive_with_locale . should_fail_with Illegal_Argument
|
||||
|
||||
Test.group "Text.match" <|
|
||||
Test.specify "should default to regex" <|
|
||||
Test.specify "should work correctly" <|
|
||||
"My Text: Goes Here".match "^My Text: (.+)$" . should_be_true
|
||||
"555-801-1923".match "^\d{3}-\d{3}-\d{4}$" . should_be_true
|
||||
"Hello".match "^[a-z]+$" . should_be_false
|
||||
@ -1344,12 +1348,19 @@ spec =
|
||||
"abcd".match "abc" . should_be_false
|
||||
"x".match "[a-z]" . should_be_true
|
||||
|
||||
Test.specify "`match` with an empty pattern should be an error" <|
|
||||
'b'.match '' . should_fail_with Illegal_Argument
|
||||
|
||||
Test.specify "should be possible on unicode text" <|
|
||||
"Korean: 건반".match "^Korean: (.+)$" . should_be_true
|
||||
|
||||
Test.specify "should be possible in case-insensitive mode" <|
|
||||
"MY".match "my" Case_Sensitivity.Insensitive . should_be_true
|
||||
|
||||
# `match` rejects a case-insensitive setting that carries an explicit locale,
# reporting `Illegal_Argument`.
Test.specify "should not allow non-default locale" <|
    british = Locale.new "en" "GB" "UTF-8"
    insensitive_with_locale = Case_Sensitivity.Insensitive british
    'a'.match 'a' case_sensitivity=insensitive_with_locale . should_fail_with Illegal_Argument
|
||||
|
||||
Test.group "Regex splitting" <|
|
||||
Test.specify "should be possible on text" <|
|
||||
splits = "abcde".split "[bd]" Regex_Matcher.Value
|
||||
@ -1402,141 +1413,113 @@ spec =
|
||||
Test.group "Text.replace" <|
|
||||
Test.specify "should work as in examples" <|
|
||||
'aaa'.replace 'aa' 'b' . should_equal 'ba'
|
||||
"Hello World!".replace "[lo]" "#" matcher=Regex_Matcher.Value . should_equal "He### W#r#d!"
|
||||
"Hello World!".replace "l" "#" mode=Matching_Mode.First . should_equal "He#lo World!"
|
||||
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' matcher=Regex_Matcher.Value . should_equal '(abc) foo (bar) baz'
|
||||
'ß'.replace 'S' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'AA'
|
||||
'affib'.replace 'i' 'X' matcher=Text_Matcher.Case_Insensitive . should_equal 'aXb'
|
||||
"Hello World!".replace "[lo]" "#" use_regex=True . should_equal "He### W#r#d!"
|
||||
"Hello World!".replace "l" "#" only_first=True . should_equal "He#lo World!"
|
||||
'"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' use_regex=True . should_equal '(abc) foo (bar) baz'
|
||||
|
||||
# Applies the same regex replacement to each element of a vector and compares
# the results element by element.
Test.specify "works when mapped over a vector of inputs" <|
    texts = ["axyz", "bxyz", "xabcz", "zazaz"]
    expected = ["qxyz", "qxyz", "xqqqz", "zqzqz"]
    replaced = texts.map (text-> text.replace "[abc]" "q" use_regex=True)
    replaced . should_equal expected
|
||||
|
||||
Test.specify "should correctly handle empty-string edge cases" <|
|
||||
[Regex_Mode.All, Matching_Mode.First, Matching_Mode.Last] . each mode->
|
||||
'aaa'.replace '' 'foo' mode=mode . should_equal 'aaa'
|
||||
''.replace '' '' mode=mode . should_equal ''
|
||||
'a'.replace 'a' '' mode=mode . should_equal ''
|
||||
''.replace 'a' 'b' mode=mode . should_equal ''
|
||||
[True, False] . each only_first->
|
||||
'aaa'.replace '' 'foo' only_first=only_first . should_equal 'aaa'
|
||||
'a'.replace 'a' '' only_first=only_first . should_equal ''
|
||||
''.replace 'a' 'b' only_first=only_first . should_equal ''
|
||||
|
||||
'aba' . replace 'a' '' Matching_Mode.First . should_equal 'ba'
|
||||
'aba' . replace 'a' '' Matching_Mode.Last . should_equal 'ab'
|
||||
'aba' . replace 'a' '' only_first=True . should_equal 'ba'
|
||||
'aba' . replace 'a' '' . should_equal 'b'
|
||||
'aba' . replace 'c' '' . should_equal 'aba'
|
||||
|
||||
Test.specify "should correctly handle first, all and last matching with overlapping occurrences" <|
|
||||
"aaa aaa".replace "aa" "c" . should_equal "ca ca"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First . should_equal "ca aaa"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last . should_equal "aaa ac"
|
||||
"aaa aaa".replace "aa" "c" only_first=True . should_equal "ca aaa"
|
||||
|
||||
Test.specify "Regex `replace` with an empty pattern should be an error" <|
|
||||
'b'.replace '' 'c' use_regex=True . should_fail_with Illegal_Argument
|
||||
|
||||
Test.specify "should correctly handle case-insensitive matches" <|
|
||||
'AaąĄ' . replace "A" "-" matcher=Text_Matcher.Case_Insensitive . should_equal '--ąĄ'
|
||||
'AaąĄ' . replace "A" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal '--ąĄ'
|
||||
'AaąĄ' . replace "A" "-" . should_equal '-aąĄ'
|
||||
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' matcher=Text_Matcher.Case_Sensitive . should_equal 'HeLlO wOrLd'
|
||||
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' matcher=Text_Matcher.Case_Insensitive . should_equal 'Hey, wOrLd'
|
||||
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' case_sensitivity=Case_Sensitivity.Sensitive . should_equal 'HeLlO wOrLd'
|
||||
'HeLlO wOrLd' . replace 'hElLo' 'Hey,' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'Hey, wOrLd'
|
||||
|
||||
"Iiİı" . replace "i" "-" . should_equal "I-İı"
|
||||
"Iiİı" . replace "I" "-" . should_equal "-iİı"
|
||||
"Iiİı" . replace "İ" "-" . should_equal "Ii-ı"
|
||||
"Iiİı" . replace "ı" "-" . should_equal "Iiİ-"
|
||||
|
||||
"Iiİı" . replace "i" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "--İı"
|
||||
"Iiİı" . replace "I" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "--İı"
|
||||
"Iiİı" . replace "İ" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "Ii-ı"
|
||||
"Iiİı" . replace "ı" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "Iiİ-"
|
||||
"Iiİı" . replace "i" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "--İı"
|
||||
"Iiİı" . replace "I" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "--İı"
|
||||
"Iiİı" . replace "İ" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "Ii-ı"
|
||||
"Iiİı" . replace "ı" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "Iiİ-"
|
||||
|
||||
tr_insensitive = Text_Matcher.Case_Insensitive (Locale.new "tr")
|
||||
"Iiİı" . replace "i" "-" matcher=tr_insensitive . should_equal "I--ı"
|
||||
"Iiİı" . replace "I" "-" matcher=tr_insensitive . should_equal "-iİ-"
|
||||
"Iiİı" . replace "İ" "-" matcher=tr_insensitive . should_equal "I--ı"
|
||||
"Iiİı" . replace "ı" "-" matcher=tr_insensitive . should_equal "-iİ-"
|
||||
# Case-insensitive, non-regex replacement on characters whose case folding
# expands to several letters ('ß', the 'ffi' ligature).
Test.specify "should correctly handle Unicode" <|
    sharp_s = 'ß'
    sharp_s.replace 'S' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'AA'
    sharp_s.replace 'ß' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'A'

    with_ligature = 'affib'
    with_ligature.replace 'i' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
    with_ligature.replace 'ffi' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb'
|
||||
|
||||
Test.specify "should correctly handle Unicode edge cases" <|
|
||||
'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}'
|
||||
'sśs\u{301}' . replace 's' 'O' Matching_Mode.Last . should_equal 'Ośs\u{301}'
|
||||
'śs\u{301}s' . replace 's' 'O' Matching_Mode.First . should_equal 'śs\u{301}O'
|
||||
'śss\u{301}' . replace 's' 'O' only_first=True . should_equal 'śOs\u{301}'
|
||||
|
||||
'sśs\u{301}' . replace 'ś' 'O' . should_equal 'sOO'
|
||||
'śss\u{301}' . replace 'ś' 'O' only_first=True . should_equal 'Oss\u{301}'
|
||||
|
||||
'sśs\u{301}' . replace 's\u{301}' 'O' . should_equal 'sOO'
|
||||
's\u{301}śs' . replace 's\u{301}' 'O' . should_equal 'OOs'
|
||||
|
||||
'SŚS\u{301}' . replace 's' 'O' . should_equal 'SŚS\u{301}'
|
||||
'SŚS\u{301}' . replace 's' 'O' Matching_Mode.Last . should_equal 'SŚS\u{301}'
|
||||
'ŚS\u{301}S' . replace 's' 'O' Matching_Mode.First . should_equal 'ŚS\u{301}S'
|
||||
'ŚS\u{301}S' . replace 's' 'O' only_first=True . should_equal 'ŚS\u{301}S'
|
||||
|
||||
'SŚS\u{301}' . replace 'ś' 'O' . should_equal 'SŚS\u{301}'
|
||||
'SŚS\u{301}' . replace 's\u{301}' 'O' . should_equal 'SŚS\u{301}'
|
||||
|
||||
'SŚS\u{301}' . replace 's' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'OŚS\u{301}'
|
||||
'SŚS\u{301}' . replace 's' 'O' Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'OŚS\u{301}'
|
||||
'ŚS\u{301}S' . replace 's' 'O' Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'ŚS\u{301}O'
|
||||
'SŚS\u{301}' . replace 's' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'OŚS\u{301}'
|
||||
'ŚS\u{301}S' . replace 's' 'O' only_first=True case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'ŚS\u{301}O' # 'ŚO\u{301}O' # 'ŚOS\u{301}S'
|
||||
|
||||
'SŚS\u{301}' . replace 'ś' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'SOO'
|
||||
'SŚS\u{301}' . replace 's\u{301}' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'SOO'
|
||||
'SŚS\u{301}' . replace 'ś' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'SOO'
|
||||
'SŚS\u{301}' . replace 's\u{301}' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'SOO'
|
||||
|
||||
'✨🚀🚧😍😃😍😎😙😉☺' . replace '🚧😍' '|-|:)' . should_equal '✨🚀|-|:)😃😍😎😙😉☺'
|
||||
'Rocket Science' . replace 'Rocket' '🚀' . should_equal '🚀 Science'
|
||||
|
||||
"Korean: 건반".replace "건반" "keyboard" . should_equal "Korean: keyboard"
|
||||
|
||||
Test.specify "will approximate ligature matches" <|
|
||||
# TODO do we want to improve this? highly non-trivial for very rare edge cases
|
||||
## Currently we lack 'resolution' to extract a partial match from
|
||||
the ligature to keep it, probably would need some special
|
||||
mapping.
|
||||
'ffiffi'.replace 'ff' 'aa' matcher=Text_Matcher.Case_Insensitive . should_equal 'aaaa'
|
||||
'ffiffi'.replace 'ff' 'aa' mode=Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'aaffi'
|
||||
'ffiffi'.replace 'ff' 'aa' mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'ffiaa'
|
||||
'affiffib'.replace 'IF' 'X' matcher=Text_Matcher.Case_Insensitive . should_equal 'aXb'
|
||||
'aiffiffz' . replace 'if' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'a--fz'
|
||||
'AFFIB'.replace 'ffi' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'A-B'
|
||||
|
||||
'ß'.replace 'SS' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
|
||||
'ß'.replace 'S' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'AA'
|
||||
'ß'.replace 'S' 'A' mode=Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
|
||||
'ß'.replace 'S' 'A' mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'A'
|
||||
'STRASSE'.replace 'ß' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'STRA-E'
|
||||
# The same input and pattern give different results in the two modes: the
# plain replace leaves the accented 's\u{301}' grapheme intact, while the regex
# replace also rewrites the base 's' of that grapheme.
Test.specify "regex and non-regex replace handle accented grapheme splitting differently" <|
    accented = 'sśs\u{301}'
    plain_result = accented . replace 's' 'O'
    regex_result = accented . replace 's' 'O' use_regex=True
    plain_result . should_equal 'Ośs\u{301}'
    regex_result . should_equal 'OśO\u{301}'
|
||||
|
||||
Test.specify "should perform simple replacement in Regex mode" <|
|
||||
"ababab".replace "b" "a" matcher=Regex_Matcher.Value . should_equal "aaaaaa"
|
||||
"ababab".replace "b" "a" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "aaabab"
|
||||
"ababab".replace "b" "a" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "ababaa"
|
||||
"ababab".replace "b" "a" use_regex=True . should_equal "aaaaaa"
|
||||
"ababab".replace "b" "a" only_first=True use_regex=True . should_equal "aaabab"
|
||||
|
||||
"aaaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "cc"
|
||||
"aaaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "caa"
|
||||
"aaaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "aac"
|
||||
"aaaa".replace "aa" "c" use_regex=True . should_equal "cc"
|
||||
"aaaa".replace "aa" "c" only_first=True use_regex=True . should_equal "caa"
|
||||
|
||||
"aaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "ca"
|
||||
"aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "ca"
|
||||
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Sensitive . should_equal "ac"
|
||||
"aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "ca"
|
||||
"aaa".replace "aa" "c" use_regex=True . should_equal "ca"
|
||||
"aaa".replace "aa" "c" only_first=True use_regex=True . should_equal "ca"
|
||||
|
||||
"aaa aaa".replace "aa" "c" matcher=Text_Matcher.Case_Sensitive . should_equal "ca ca"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Text_Matcher.Case_Sensitive . should_equal "ca aaa"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Sensitive . should_equal "aaa ac"
|
||||
"aaa aaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "ca ca"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "ca aaa"
|
||||
"aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "aaa ca"
|
||||
"aaa aaa".replace "aa" "c" case_sensitivity=Case_Sensitivity.Sensitive use_regex=True . should_equal "ca ca"
|
||||
"aaa aaa".replace "aa" "c" only_first=True case_sensitivity=Case_Sensitivity.Sensitive use_regex=True . should_equal "ca aaa"
|
||||
"aaa aaa".replace "aa" "c" use_regex=True . should_equal "ca ca"
|
||||
"aaa aaa".replace "aa" "c" only_first=True use_regex=True . should_equal "ca aaa"
|
||||
|
||||
Test.specify "in Regex mode should work with Unicode" <|
|
||||
"Korean: 건반".replace "건반" "keyboard" matcher=Regex_Matcher.Value . should_equal "Korean: keyboard"
|
||||
'sśs\u{301}'.replace 'ś' '-' matcher=Regex_Matcher.Value . should_equal 's--'
|
||||
'sśs\u{301}'.replace 's\u{301}' '-' matcher=Regex_Matcher.Value . should_equal 's--'
|
||||
|
||||
Test.specify "in Regex mode should support various Regex options" <|
|
||||
r1 = "İiİ".replace "\w" "a" matcher=(Regex_Matcher.Value match_ascii=True)
|
||||
r1 . should_equal "İaİ"
|
||||
r2 = "abaBa".replace "b" "a" matcher=(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive)
|
||||
r2 . should_equal "aaaaa"
|
||||
r3 = 'ab\na'.replace "b." "a" matcher=(Regex_Matcher.Value dot_matches_newline=True)
|
||||
r3 . should_equal "aaa"
|
||||
|
||||
text = """
|
||||
Foo
|
||||
bar
|
||||
r4 = text.replace '\n' "" matcher=(Regex_Matcher.Value multiline=True)
|
||||
r4 . should_equal "Foobar"
|
||||
|
||||
r5 = "ababd".replace "b\w # Replacing a `b` followed by any word character" "a" matcher=(Regex_Matcher.Value comments=True)
|
||||
r5 . should_equal "aaa"
|
||||
"Korean: 건반".replace "건반" "keyboard" use_regex=True . should_equal "Korean: keyboard"
|
||||
'sśs\u{301}'.replace 'ś' '-' use_regex=True . should_equal 's-s\u{301}'
|
||||
'sśs\u{301}'.replace 's\u{301}' '-' use_regex=True . should_equal 'sś-'
|
||||
|
||||
Test.specify "in Regex mode should allow referring to capture groups in substitutions" <|
|
||||
'<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' matcher=Regex_Matcher.Value . should_equal 'content is at url'
|
||||
'<a href="url">content</a>'.replace '<a href="(?<address>.*?)">(?<text>.*?)</a>' '${text} is at ${address}' matcher=Regex_Matcher.Value . should_equal 'content is at url'
|
||||
'<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' use_regex=True . should_equal 'content is at url'
|
||||
'<a href="url">content</a>'.replace '<a href="(?<address>.*?)">(?<text>.*?)</a>' '$<text> is at $<address>' use_regex=True . should_equal 'content is at url'
|
||||
|
||||
Test.specify "should not allow non-default locale in regex replace" <|
|
||||
locale = Locale.new "en" "GB" "UTF-8"
|
||||
'a'.replace 'a' 'b' case_sensitivity=(Case_Sensitivity.Insensitive locale) use_regex=True . should_fail_with Illegal_Argument
|
||||
|
||||
Test.specify "should allow non-default locale in text replace" <|
|
||||
locale = Locale.new "en" "GB" "UTF-8"
|
||||
'a'.replace 'a' 'b' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_equal 'b'
|
||||
|
||||
main = Test_Suite.run_main spec
|
||||
|
Loading…
Reference in New Issue
Block a user