From 6b9cbeacb2caa7606d91fda90ec3ec40acb39c70 Mon Sep 17 00:00:00 2001 From: GregoryTravis Date: Tue, 28 Mar 2023 02:13:12 -0400 Subject: [PATCH] Implement Regular Expression replace and update `Text.replace` to the new API (#5959) Re-implement replace on top of Truffle regex. --- CHANGELOG.md | 3 + .../0.0.0-dev/src/Data/Text/Extensions.enso | 168 ++--- .../Base/0.0.0-dev/src/Data/Text/Helpers.enso | 16 + .../src/Data/Text/Regex/Match_2.enso | 56 +- .../src/Data/Text/Regex/Pattern_2.enso | 141 +++- .../src/Data/Text/Regex/Replacer.enso | 144 ++++ .../Base/0.0.0-dev/src/Data/Text/Regex_2.enso | 16 +- .../Table/0.0.0-dev/src/Data/Table.enso | 30 +- .../src/Internal/Naming_Helpers.enso | 2 +- .../java/org/enso/base/Replacer_Cache.java | 51 ++ .../Table_Tests/src/In_Memory/Table_Spec.enso | 2 +- test/Tests/src/Data/Text/Regex_2_Spec.enso | 707 +++++++----------- test/Tests/src/Data/Text_Spec.enso | 221 +++--- 13 files changed, 876 insertions(+), 681 deletions(-) create mode 100644 distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Helpers.enso create mode 100644 distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Replacer.enso create mode 100644 std-bits/base/src/main/java/org/enso/base/Replacer_Cache.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 66aa4553a42..79f193b2e0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -363,6 +363,8 @@ - [Aligned names of columns created by column operations.][5850] - [Improved `cross_tab`. Renamed `fill_missing` and `is_missing` to `fill_nothing` and `is_nothing`. 
Added `fill_empty`.][5863] +- [Removed many regex compile flags from `replace`; added `only_first` + flag.][5959] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -550,6 +552,7 @@ [5863]: https://github.com/enso-org/enso/pull/5863 [5917]: https://github.com/enso-org/enso/pull/5917 [5705]: https://github.com/enso-org/enso/pull/5705 +[5959]: https://github.com/enso-org/enso/pull/5959 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index f6318e0fbb9..75ea00bead1 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -10,6 +10,7 @@ import project.Data.Range.Range import project.Data.Text.Case.Case import project.Data.Text.Case_Sensitivity.Case_Sensitivity import project.Data.Text.Encoding.Encoding +import project.Data.Text.Helpers import project.Data.Text.Location.Location import project.Data.Text.Matching_Mode.Matching_Mode import project.Data.Text.Regex.Match.Match @@ -218,6 +219,10 @@ Text.characters self = - case_sensitivity: Specifies if the text values should be compared case sensitively. + If an empty regex is used, `find` throws an Illegal_Argument error. + + If a non-default locale is used, `find` throws an Illegal_Argument error. + > Example Find the first substring matching the regex. @@ -227,10 +232,12 @@ Text.characters self = example_find_insensitive = ## This matches `aBc` @ character 11 "aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive -Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error +Text.find : Text -> Case_Sensitivity -> Match | Nothing ! 
Regex_Syntax_Error | Illegal_Argument Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = - case_insensitive = case_sensitivity.is_case_insensitive_in_memory - Regex_2.compile pattern case_insensitive=case_insensitive . match self + Helpers.regex_assume_default_locale case_sensitivity <| + case_insensitive = case_sensitivity.is_case_insensitive_in_memory + compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive + compiled_pattern.if_not_error <| compiled_pattern.match self ## Finds all the matches of the regular expression `pattern` in `self`, returning a Vector. If not found, will be an empty Vector. @@ -240,6 +247,10 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = - case_sensitivity: Specifies if the text values should be compared case sensitively. + If an empty regex is used, `find_all` throws an Illegal_Argument error. + + If a non-default locale is used, `find_all` throws an Illegal_Argument error. + > Example Find the substring matching the regex. @@ -249,10 +260,12 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = example_find_all_insensitive = ## This matches `aABbbbc` @ character 0 and `aBC` @ character 11 "aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive -Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error +Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error | Illegal_Argument Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = - case_insensitive = case_sensitivity.is_case_insensitive_in_memory - Regex_2.compile pattern case_insensitive=case_insensitive . 
match_all self + Helpers.regex_assume_default_locale case_sensitivity <| + case_insensitive = case_sensitivity.is_case_insensitive_in_memory + compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive + compiled_pattern.if_not_error <| compiled_pattern.match_all self ## ALIAS Check Matches @@ -263,6 +276,10 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = - case_sensitivity: Specifies if the text values should be compared case sensitively. + If an empty regex is used, `match` throws an Illegal_Argument error. + + If a non-default locale is used, `match` throws an Illegal_Argument error. + > Example Checks if whole text matches a basic email regex. @@ -274,11 +291,12 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = regex = ".+ct@.+" # Evaluates to true "CONTACT@enso.org".match regex Case_Sensitivity.Insensitive -Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error +Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error | Illegal_Argument Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = - case_insensitive = case_sensitivity.is_case_insensitive_in_memory - compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive - compiled_pattern.matches self + Helpers.regex_assume_default_locale case_sensitivity <| + case_insensitive = case_sensitivity.is_case_insensitive_in_memory + compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive + compiled_pattern.if_not_error <| compiled_pattern.matches self ## ALIAS Split Text @@ -327,21 +345,31 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter compiled_pattern.split self mode=Regex_Mode.All ## ALIAS Replace Text - Replaces the first, last, or all occurrences of term with new_text in the - input. If `term` is empty, the function returns the input unchanged. + Perform a text or regex replace. 
+ + Returns the text with all matched elements replaced by the provided + replacement. If `term` is empty, the function returns the input unchanged. + + The replacement string can contain references to groups matched by the + regex. The following syntaxes are supported: + $0: the entire match string + $&: the entire match string + $n: the nth group + $<foo>: Named group `foo` Arguments: - term: The term to find. - new_text: The new text to replace occurrences of `term` with. - If `matcher` is a `Regex_Matcher`, `new_text` can include replacement - patterns (such as `$`) for a marked group. - mode: Specifies which occurences of term the engine tries to find. When the - mode is `First` or `Last`, this method replaces the first or last occurence - of term in the input. If set to `All`, it replaces all occurences of term in - the input. - - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity - rules specified in the matcher. If a `Regex_Matcher`, the term is used as a - regular expression and matched using the associated options. + - term: The string or regex to find. + - replacement: The text to replace matches with. + - case_sensitivity: Enables or disables case-insensitive matching. Case + insensitive matching behaves as if it normalises the case of all input + text before matching on it. + - only_first: If True, only replace the first match. + - use_regex: If true, the term is used as a regular expression. + + If an empty regex is used, `replace` throws an Illegal_Argument error. + + If a non-default locale is used with a regex, `replace` throws an + Illegal_Argument error. > Example Replace letters in the text "aaa". @@ -351,17 +379,17 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter > Example Replace all occurrences of letters 'l' and 'o' with '#'. - "Hello World!".replace "[lo]" "#" matcher=Regex_Matcher == "He### W#r#d!" + "Hello World!".replace "[lo]" "#" use_regex=True == "He### W#r#d!" 
> Example Replace the first occurrence of letter 'l' with '#'. - "Hello World!".replace "l" "#" mode=Matching_Mode.First == "He#lo World!" + "Hello World!".replace "l" "#" only_first=True == "He#lo World!" > Example Replace texts in quotes with parentheses. - '"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' matcher=Regex_Matcher == '(abc) foo (bar) baz' + '"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' use_regex=True == '(abc) foo (bar) baz' ! Matching Grapheme Clusters In case-insensitive mode, a single character can match multiple characters, @@ -378,62 +406,40 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter > Example Extended partial matches in case-insensitive mode. - # The ß symbol matches the letter `S` twice in case-insensitive mode, because it folds to `ss`. - 'ß'.replace 'S' 'A' matcher=(Text_Matcher Case_Insensitive) . should_equal 'AA' + # The ß symbol matches the letter `S` twice in case-insensitive mode, because it folds to `ss`. + 'ß'.replace 'S' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'AA' # The 'ffi' ligature is a single grapheme cluster, so even if just a part of it is matched, the whole grapheme is replaced. - 'affib'.replace 'i' 'X' matcher=(Text_Matcher Case_Insensitive) . should_equal 'aXb' - - ! Last Match in Regex Mode - Regex always performs the search from the front and matching the last - occurrence means selecting the last of the matches while still generating - matches from the beginning. Regex does not return overlapping matches - it - will return a match at some position and then continue the search after that - match. This will lead to slightly different behavior for overlapping - occurrences of a pattern in Regex mode than in exact text matching mode - where the matches are searched for from the back. + 'affib'.replace 'i' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb' > Example - Comparing Matching in Last Mode in Regex and Text mode + Regexp replace. 
- "aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher . should_equal "ac" - "aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher . should_equal "ca" + 'content'.replace '(.*?)' '$2 is at $1' use_regex=True == 'content is at url' - "aaa aaa".replace "aa" "c" matcher=Text_Matcher . should_equal "ca ca" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Text_Matcher . should_equal "ca aaa" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher . should_equal "aaa ac" - "aaa aaa".replace "aa" "c" matcher=Regex_Matcher . should_equal "ca ca" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher . should_equal "ca aaa" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher . should_equal "aaa ca" -Text.replace : Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Text -Text.replace self term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive = if term.is_empty then self else - case matcher of - _ : Text_Matcher -> +Text.replace : Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Text ! 
Illegal_Argument +Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False = + case use_regex of + False -> if term.is_empty then self else array_from_single_result result = case result of Nothing -> Array.empty _ -> Array.new_1 result - spans_array = case matcher of - Text_Matcher.Case_Sensitive -> case mode of - Regex_Mode.All -> - Text_Utils.span_of_all self term - Matching_Mode.First -> - array_from_single_result <| Text_Utils.span_of self term - Matching_Mode.Last -> - array_from_single_result <| Text_Utils.last_span_of self term - _ -> Error.throw (Illegal_Argument.Error "Invalid mode.") - Text_Matcher.Case_Insensitive locale -> case mode of - Regex_Mode.All -> + spans_array = case case_sensitivity of + Case_Sensitivity.Sensitive -> case only_first of + False -> Text_Utils.span_of_all self term + True -> array_from_single_result <| Text_Utils.span_of self term + Case_Sensitivity.Insensitive locale -> case only_first of + False -> Text_Utils.span_of_all_case_insensitive self term locale.java_locale - Matching_Mode.First -> + True -> array_from_single_result <| Text_Utils.span_of_case_insensitive self term locale.java_locale False - Matching_Mode.Last -> - array_from_single_result <| - Text_Utils.span_of_case_insensitive self term locale.java_locale True - _ -> Error.throw (Illegal_Argument.Error "Invalid mode.") - Text_Utils.replace_spans self spans_array new_text - _ : Regex_Matcher -> - compiled_pattern = matcher.compile term - compiled_pattern.replace self new_text mode=mode + Text_Utils.replace_spans self spans_array replacement + True -> + Helpers.regex_assume_default_locale case_sensitivity <| + case_insensitive = case_sensitivity.is_case_insensitive_in_memory + compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive + compiled_pattern.if_not_error <| + compiled_pattern.replace self replacement only_first ## ALIAS Get Words @@ -1115,9 +1121,9 @@ Text.trim self where=Location.Both 
what=_.is_whitespace = term = "straße" text = "MONUMENTENSTRASSE 42" - match = text . locate term matcher=(Text_Matcher Case_Insensitive) - term.length == 6 - match.length == 7 + match = text . locate term case_sensitivity=Case_Sensitivity.Insensitive + term.length . should_equal 6 + match.length . should_equal 7 ! Matching Grapheme Clusters In case-insensitive mode, a single character can match multiple characters, @@ -1265,11 +1271,8 @@ Text.locate_all self term="" case_sensitivity=Case_Sensitivity.Sensitive = if te - term: The term to find. - start: The index to start searching from. If the index is negative, it is counted from the end of the vector. - - matcher: Specifies how the term is matched against the input: - - If a `Text_Matcher`, the text is compared using case-sensitively rules - specified in the matcher. - - If a `Regex_Matcher`, the `term` is used as a regular expression and - matched using the associated options. + - case_sensitivity: Specifies if the text values should be compared case + sensitively. ! What is a Character? A character is defined as an Extended Grapheme Cluster, see Unicode @@ -1301,11 +1304,8 @@ Text.index_of self term="" start=0 case_sensitivity=Case_Sensitivity.Sensitive = - term: The term to find. - start: The index to start searching backwards from. If the index is negative, it is counted from the end of the vector. - - matcher: Specifies how the term is matched against the input: - - If a `Text_Matcher`, the text is compared using case-sensitively rules - specified in the matcher. - - If a `Regex_Matcher`, the `term` is used as a regular expression and - matched using the associated options. + - case_sensitivity: Specifies if the text values should be compared case + sensitively. ! What is a Character? 
A character is defined as an Extended Grapheme Cluster, see Unicode diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Helpers.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Helpers.enso new file mode 100644 index 00000000000..035f651441b --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Helpers.enso @@ -0,0 +1,16 @@ +from Standard.Base import all + +import project.Any.Any +import project.Data.Locale.Locale +import project.Data.Text.Case_Sensitivity.Case_Sensitivity +import project.Errors.Illegal_Argument.Illegal_Argument + +## PRIVATE +regex_assume_default_locale : Case_Sensitivity -> Any -> Any ! Illegal_Argument +regex_assume_default_locale case_sensitivity ~action = case case_sensitivity of + Case_Sensitivity.Sensitive -> action + Case_Sensitivity.Insensitive locale -> case locale == Locale.default of + True -> action + False -> + msg = "Custom locales are not supported for regexes." + Error.throw (Illegal_Argument.Error msg) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso index 4a322abfa02..8be1b068d1a 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso @@ -8,8 +8,8 @@ import project.Data.Text.Span.Span import project.Data.Text.Span.Utf_16_Span import project.Data.Text.Text import project.Data.Vector.Vector -import project.Errors.Common.Index_Out_Of_Bounds import project.Error.Error +import project.Errors.Common.Index_Out_Of_Bounds import project.Nothing.Nothing import project.Panic.Panic @@ -32,7 +32,7 @@ type Match_2 internal_start : Integer -> Integer internal_start self group = self.internal_regex_result.getStart group - ## PRIVATE + ## PRIVATE Returns the end UTF16 character index, plus one, of a group. This method goes directly to the internal match object. 
It does not @@ -48,9 +48,9 @@ type Match_2 Arguments: - group: the group name or number. Marked groups defined in the regex are numbered starting at 1; group 0 refers to the entire match. - utf16_start : Integer | Text -> Integer - utf16_start self group=0 = - span = self.span group + utf_16_start : Integer | Text -> Integer + utf_16_start self group=0 = + span = self.utf_16_span group if span.is_nothing then Nothing else span.start ## Returns the end UTF16 character index, plus one, of a group. @@ -58,9 +58,9 @@ type Match_2 Arguments: - group: the group name or number. Marked groups defined in the regex are numbered starting at 1; group 0 refers to the entire match. - utf16_end : Integer | Text -> Integer - utf16_end self group=0 = - span = self.span group + utf_16_end : Integer | Text -> Integer + utf_16_end self group=0 = + span = self.utf_16_span group if span.is_nothing then Nothing else span.end ## Returns the start grapheme index of a group. @@ -75,7 +75,7 @@ type Match_2 numbered starting at 1; group 0 refers to the entire match. start : Integer | Text -> Integer start self group=0 = - span = self.grapheme_span group + span = self.span group if span.is_nothing then Nothing else span.start ## Returns the end grapheme index, plus one, of a group. @@ -90,7 +90,7 @@ type Match_2 numbered starting at 1; group 0 refers to the entire match. end : Integer | Text -> Integer end self group=0 = - span = self.grapheme_span group + span = self.span group if span.is_nothing then Nothing else span.end ## Gets the UTF16 span matched by the group with the provided identifier, or @@ -120,9 +120,9 @@ type Match_2 In this case, the group id for "(d)", which is 3, is a valid group id and (Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3, - Match_2.group will return the default value. - span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group - span self group=0 ~default=Nothing = + Match_2.utf_16_span will return the default value. 
+ utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group + utf_16_span self group=0 ~default=Nothing = group_id = self.pattern.lookup_group group start = self.internal_start group_id end = self.internal_end group_id @@ -158,10 +158,10 @@ type Match_2 In this case, the group id for "(d)", which is 3, is a valid group id and (Pattern_2.lookup_group 3) will return 3. If the caller tries to get - group 3, Match_2.group will return the default value. - grapheme_span : Integer | Text -> Any -> Span ! No_Such_Group - grapheme_span self group=0 ~default=Nothing = - result = self.span group Nothing + group 3, Match_2.span will return the default value. + span : Integer | Text -> Any -> Span ! No_Such_Group + span self group=0 ~default=Nothing = + result = self.utf_16_span group Nothing if result.is_nothing then default else result.to_grapheme_span ## Gets the Text matched by the group with the provided identifier, or @@ -186,10 +186,10 @@ type Match_2 In this case, the group id for "(d)", which is 3, is a valid group id and (Pattern_2.lookup_group 3) will return 3. If the caller tries to get - group 3, Match_2.group will return the default value. + group 3, Match_2.text will return the default value. text : Integer | Text -> Any -> Text ! No_Such_Group text self group=0 ~default=Nothing = - result = self.grapheme_span group Nothing + result = self.span group Nothing if result.is_nothing then default else result.text ## Gets a vector containing the Text of _all_ of the capturing groups in @@ -208,6 +208,16 @@ type Match_2 If the regex contained named groups, these may also be accessed by index based on their position in the pattern. + Note that it is possible for a group to "not participate in the match", + for example with a disjunction. In the example below, the "(d)" group + does not participate -- it neither matches nor fails. 
+ + "ab((c)|(d))".find "abc" + + In this case, the group id for "(d)", which is 3, is a valid group id and + (Pattern_2.lookup_group 3) will return 3. `groups` will return the + default value for groups that do not participate. + > Example Get a vector of the text matched by all of the groups in this match, replacing the value for groups that didn't match with "UNMATCHED". @@ -237,8 +247,8 @@ type Match_2 "ab((c)|(d))".find "abc" In this case, the group id for "(d)", which is 3, is a valid group id and - (Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3, - Match_2.group will return the default value. + (Pattern_2.lookup_group 3) will return 3. `named_groups` will map + a named group that does not participate to the default value. > Example Get the map of all of the named groups in this match, replacing the @@ -261,7 +271,7 @@ type Match_2 Arguments: - id: The integer index or name of that group. - if_missing: The value to return if the index is out of bounds. - get : Integer -> Any -> Any + get : Integer -> Any -> Text | Any get self index ~if_missing=Nothing = self.text index . catch No_Such_Group (_-> if_missing) @@ -272,6 +282,6 @@ type Match_2 Arguments: - id: The integer index or name of that group. - if_missing: The value to return if the index is out of bounds. - at : Integer -> Any ! Index_Out_Of_Bounds + at : Integer -> Text ! 
Index_Out_Of_Bounds at self index = self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count)) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso index 134f6620b5c..2eb7205ae66 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso @@ -6,17 +6,19 @@ import project.Data.Range.Range import project.Data.Text.Span.Span import project.Data.Text.Span.Utf_16_Span import project.Data.Text.Regex.Match_2.Match_2 +import project.Data.Text.Regex.Replacer.Replacer import project.Data.Text.Regex_2.No_Such_Group import project.Data.Text.Text import project.Data.Vector.Vector import project.Error.Error +import project.Errors.Illegal_Argument.Illegal_Argument import project.Meta import project.Nothing.Nothing -import project.Panic.Panic import project.Polyglot.Polyglot from project.Data.Boolean import Boolean, True, False +polyglot java import org.enso.base.Replacer_Cache polyglot java import org.enso.base.Text_Utils type Pattern_2 @@ -50,22 +52,24 @@ type Pattern_2 ## Tries to match the provided `input` against the pattern `self`. - Returns a `Vector Match_2` objects, each containing the matched text + Returns a `Vector Match_2` object, each containing the matched text and its match groups. Arguments: - input: The text to match the pattern described by `self` against. - match_all : Text -> Vector Match_2 + match_all : Text -> Vector Match_2 ! 
Illegal_Argument match_all self input = - builder = Vector.new_builder - it = Match_Iterator.new self input - go it = case it.next of - Match_Iterator_Value.Next _ match next_it -> - builder.append match - go next_it - Match_Iterator_Value.Last _ -> Nothing - go it - builder.to_vector + pattern_is_empty = self.internal_regex_object.pattern == '' + if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else + builder = Vector.new_builder + it = Match_Iterator.new self input + go it = case it.next of + Match_Iterator_Value.Next _ match next_it -> + builder.append match + go next_it + Match_Iterator_Value.Last _ -> Nothing + go it + builder.to_vector ## Tries to match the provided `input` against the pattern `self`. @@ -89,6 +93,82 @@ type Pattern_2 find_all self input = self.match_all input . map match_to_group_maybe + ## ADVANCED + + Replace all occurrences of the pattern described by `self` in the `input` + with the specified `replacement`. + + Arguments: + - input: The text in which to perform the replacement(s). + - replacement: The literal text with which to replace any matches. + - only_first: If True, only replace the first match. + + If this method performs no replacements it will return the `input` text + unchanged. + + The replacement string can contain references to groups matched by the + regex. The following syntaxes are supported: + $0: the entire match string + $&: the entire match string + $n: the nth group + $<foo>: Named group `foo` + + > Example + Replace letters in the text "aa". + + pattern = Regex_2.compile 'aa' + pattern.replace 'aaa' 'b' == 'ba' + + > Example + Replace all occurrences of letters 'l' and 'o' with '#'. + + pattern = Regex_2.compile '[lo]' + pattern.replace 'Hello World!' '#' == 'He### W#r#d!' + + > Example + Replace the first occurrence of letter 'l' with '#'. + + pattern = Regex_2.compile 'l' + pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!' 
+ + > Example + Replace texts in quotes with parentheses. + + pattern = Regex_2.compile '"(.*?)"' + pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz' + + > Example + Replace a literal string with a replacement value. + + pattern = Regex_2.compile "aa" + input = "aa ab aa ac ad aa aa ax" + match = pattern.replace input "xyz" + match == "xyz ab xyz ac ad xyz xyz ax" + + > Example + Replace each word with the same word surrounded by `[]`. + + pattern = Regex_2.compile "([a-z]+)" + pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]" + + replace : Text -> Text -> Boolean -> Text + replace self input replacement only_first=False = + it = Match_Iterator.new self input + case it of + Match_Iterator_Value.Last filler -> filler.text + _ -> + replacer = Replacer.new replacement self + + replacer.if_not_error <| + go next current = case next of + Match_Iterator_Value.Next filler match next_it -> + new_value = current + filler.text + (replacer.replace match) + next = if only_first then next_it.early_exit else next_it.next + @Tail_Call go next new_value + Match_Iterator_Value.Last filler -> + current + filler.text + go it.next "" + ## PRIVATE Look up a match group name or number, and check that it is valid. @@ -106,6 +186,9 @@ type Pattern_2 A group name is an alias for a group number; if a name is passed to this method, it returns the corresponding group number. + If a group number is passed to `lookup_group` and it is valid, it will + simply return the group number. + Note that it is possible for a group to "not participate in the match", for example with a disjunction. In the example below, the "(d)" group does not participate -- it neither matches nor fails. @@ -138,6 +221,15 @@ type Pattern_2 _ : Integer -> n Nothing -> Error.throw (No_Such_Group.Error name) + ## PRIVATE + + Return a lazy iterator over matches against a string. + + Arguments + - text: the string to match against. 
+ iterator : Text -> Match_Iterator + iterator self input = Match_Iterator.new self input + ## Return the number of groups in the underlying RegexObject. Note, the count includes group 0 (the whole match) as well. group_count : Integer @@ -154,32 +246,51 @@ type Pattern_2 Performs the regex match, and iterates through the results. Yields both the matched parts of the string, and the 'filler' parts between them. + The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because + matches and replacement boundaries can fall in the middle of multi- + character graphemes, thereby splitting them apart. + At each step, it yields a Match_Iterator_Value, whivch has either a filler and a match, or just the final filler. A Match_Iterator_Value.Last value is return at the end, and only at the end. + + Optionally, you can call `early_exit` to have it return the remainder of + the string, unmatched, as a single Last value. (Used for `replace` with + `only_first=True`.) type Match_Iterator new : Pattern_2 -> Text -> Match_Iterator new pattern input = Match_Iterator.Value pattern input 0 Value (pattern : Pattern_2) (input : Text) (cursor : Integer) + ## Return the next match, or the last filler string if there is no + additional match. + + Also returns the next iterator, if there was a match. 
next : Match_Iterator_Value next self = regex_result = self.pattern.internal_regex_object.exec self.input self.cursor case regex_result.isMatch of False -> filler_range = Range.new self.cursor (Text_Utils.char_length self.input) - filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span + filler_span = (Utf_16_Span.Value filler_range self.input) Match_Iterator_Value.Last filler_span True -> match_start = regex_result.getStart 0 filler_range = Range.new self.cursor match_start - filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span + filler_span = (Utf_16_Span.Value filler_range self.input) match = Match_2.Value self.pattern regex_result self.input - next_cursor = match.utf16_end 0 + next_cursor = match.utf_16_end 0 next_iterator = Match_Iterator.Value self.pattern self.input next_cursor Match_Iterator_Value.Next filler_span match next_iterator + ## Returns the remainder of the string, unmatched. + early_exit : Match_Iterator_Value + early_exit self = + filler_range = Range.new self.cursor (Text_Utils.char_length self.input) + filler_span = Utf_16_Span.Value filler_range self.input + Match_Iterator_Value.Last filler_span + to_text_debug : Vector Text to_text_debug self = vb = Vector.new_builder diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Replacer.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Replacer.enso new file mode 100644 index 00000000000..d840c654b2a --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Replacer.enso @@ -0,0 +1,144 @@ +import project.Data.Numbers.Integer +import project.Data.Text.Extensions +import project.Data.Text.Regex.Match_2.Match_2 +import project.Data.Text.Regex.Pattern_2.Match_Iterator_Value +import project.Data.Text.Regex.Pattern_2.Pattern_2 +import project.Data.Text.Regex_2 +import project.Data.Text.Regex_2.No_Such_Group +import project.Data.Text.Span.Utf_16_Span +import project.Data.Text.Text +import 
project.Data.Vector.Vector +import project.Error.Error +import project.Errors.Illegal_State.Illegal_State +import project.Nothing.Nothing +import project.Panic.Panic + +from project.Data.Boolean import Boolean, True, False + +polyglot java import java.lang.StringBuilder +polyglot java import org.enso.base.Replacer_Cache + +type Replacer + ## PRIVATE + + Implements a replacement for a regular expression. + + Pattern_2.replace uses a Replacer to replace each regex match with + a replacement string. This string can contain references to match + groups from the original regex. + + The `new` smart constructor parses a Text into a vector of + Replacements. Each Replacement is either a literal string or a + group number. To provide a replacement for a regex match, the + Replacer iterates through the Replacement vector, substitutes + the match group contents for each group number, and concatenates + all the strings together to form the full replacement string. + Value (replacement : Vector Replacement) + + ## Creates a new Replacer. + + Arguments + - replacement_string: a string, possibly containing group references, + that will be used to provide a replacement in a regex match. + new : Text -> Pattern_2 -> Replacer ! No_Such_Group + new replacement_string pattern = + Replacer.Value (build_replacement_vector_cached replacement_string pattern) + + ## Build a replacement string from a match. + + Arguments: + - match: the match from the original string that is to be replaced. + replace : Match_2 -> Text + replace self match = + string_builder = StringBuilder.new + self.replacement.each replacement-> + s = case replacement of + Replacement.Literal text -> text + Replacement.Substitution group_number -> match.text group_number + string_builder.append s + string_builder.toString + +## PRIVATE + + Get the size of the Replacer LRU cache. For testing. 
+get_lru_size : Integer +get_lru_size = Replacer_Cache.getLruSize + +## PRIVATE + + Look up a replacement string in the Replacer LRU cache. For testing. +replacer_cache_lookup : Text -> Replacer | Nothing +replacer_cache_lookup replacement_string = Replacer_Cache.get replacement_string + +## PRIVATE +group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))" + +## PRIVATE + + Build a replacement vector. + + Parse the replacement string into an alternating series of literal + strings and group reference numbers. + + Uses Replacer_Cache to avoid rebuilding the vector for recently used + replacement strings. +build_replacement_vector_cached : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group +build_replacement_vector_cached replacement_string pattern = + Replacer_Cache.get_or_set replacement_string _-> + build_replacement_vector replacement_string pattern + +## PRIVATE + + Build a replacement vector. + + Parse the replacement string into an alternating series of literal + strings and group reference numbers. +build_replacement_vector : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group +build_replacement_vector replacement_string pattern = + replacement_pattern = Regex_2.compile group_reference_regex + it = replacement_pattern.iterator replacement_string + + builder = Vector.new_builder + go it = case it.next of + Match_Iterator_Value.Next filler match next_it -> + replacement = parse_group_number pattern match + replacement.if_not_error <| + builder.append (Replacement.Literal filler.text) + builder.append replacement + @Tail_Call go next_it + Match_Iterator_Value.Last filler -> + builder.append (Replacement.Literal filler.text) + result = go it + result.if_not_error <| + builder.to_vector + +## PRIVATE + + Parse a capture group reference. + + Arguments: + - pattern: the Pattern_2 used to initiate the replacement. This is used + to identify and validate capture groups. + - match: the match of the replacement string against group_reference_regex. 
+ + Returns a Replacement: a group number, or, in the case of `$$`, a literal. + + See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions +parse_group_number : Pattern_2 -> Match_2 -> Replacement ! No_Such_Group +parse_group_number pattern match = case match.text.take 2 of + "$$" -> Replacement.Literal "$" + "$<" -> + # Group 6 contains the group name without the `<>`. + group_name = match.text 6 + Replacement.Substitution (pattern.lookup_group group_name) + "$&" -> Replacement.Substitution 0 + _ -> + n = Integer.parse <| match.text 2 + Replacement.Substitution (pattern.lookup_group n) + +type Replacement + ## A string literal to replace with. + Literal (text : Text) + + ## Target group to insert. + Substitution (group_number : Integer) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso index 2d47fc4b144..9a02dc9a4cd 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso @@ -7,6 +7,7 @@ import project.Error.Error import project.Errors.Illegal_Argument.Illegal_Argument import project.Nothing.Nothing import project.Panic.Panic + from project.Data.Boolean import Boolean, True, False from project.Errors.Common import Syntax_Error @@ -17,24 +18,27 @@ polyglot java import java.util.regex.Pattern as Java_Pattern Arguments - expression: The text representing the regular expression that you want to - compile. + compile. Must be non-empty. - case_insensitive: Enables or disables case-insensitive matching. Case insensitive matching behaves as if it normalises the case of all input text before matching on it. + If an empty regex is used, `compile` throws an Illegal_Argument error. + ? Why Compile? While many regex engines are able to cache ad-hoc patterns, it is often useful to be able to manually retain a pattern that you have computed. 
This function exists so you can hold onto the resultant `Pattern_2` object, instead of immediately proceeding to match using it. -compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error +compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error | Illegal_Argument compile self expression case_insensitive=Nothing = - options_string = if case_insensitive == True then "usgi" else "usg" + if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else + options_string = if case_insensitive == True then "usgi" else "usg" - internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic-> - Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message)) + internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic-> + Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message)) - Pattern_2.Value internal_regex_object + Pattern_2.Value internal_regex_object ## ADVANCED diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso index ade0ca48a0b..9d10a741f89 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso @@ -843,8 +843,8 @@ type Table parse_problem_builder.attach_problems_before on_problems <| Table.new new_columns - ## Replaces the first, last, or all occurrences of `term` with - `new_text` in each text row of selected columns. + ## Replaces the first, or all occurrences of `term` with `new_text` in each + text row of selected columns. If `term` is empty, the function returns the table unchanged. This method follows the exact replacement semantics of the @@ -854,15 +854,13 @@ type Table - columns: Column selection criteria or a column name or index. - term: The term to find. 
- new_text: The new text to replace occurrences of `term` with. - If `matcher` is a `Regex_Matcher`, `new_text` can include replacement - patterns (such as `$`) for a marked group. - - mode: Specifies which occurences of term the engine tries to find. When the - mode is `First` or `Last`, this method replaces the first or last occurence - of term in each individual table cell. If set to `All`, it replaces all - occurences of term. - - matcher: If a `Text_Matcher`, the text is compared using case-sensitivity - rules specified in the matcher. If a `Regex_Matcher`, the term is used as a - regular expression and matched using the associated options. + If use_regex is true, `new_text` can include replacement patterns + (such as `$`) for a marked group. + - case_sensitivity: Specifies whether `term` is matched case sensitively. + Case insensitive matching behaves as if it normalises the case of all + input text before matching on it. + - only_first: If True, only replace the first match. + - use_regex: If true, the term is used as a regular expression. - on_problems: Specifies how to handle if a problem occurs, raising as a warning by default. @@ -881,21 +879,21 @@ type Table > Example Remove leading and trailing spaces from cells in multiple columns. - table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" matcher=Regex_Matcher.Value + table.replace_text By_Name ["foo", "bar"] "^\s*(.*?)\s*$" "$1" use_regex=True > Example Replace texts in quotes with parentheses in column at index 1. 
- table.replace_text 1 '"(.*?)"' '($1)' matcher=Regex_Matcher.Value - replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Matching_Mode | Regex_Mode -> (Text_Matcher | Regex_Matcher) -> Problem_Behavior -> Table - replace_text self columns=[0] term="" new_text="" mode=Regex_Mode.All matcher=Text_Matcher.Case_Sensitive on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else + table.replace_text 1 '"(.*?)"' '($1)' use_regex=True + replace_text : Text | Integer | Column_Selector | Vector (Integer | Text | Column_Selector) -> Text -> Text -> Case_Sensitivity -> Boolean -> Boolean -> Problem_Behavior -> Table + replace_text self columns=[0] term="" new_text="" case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False on_problems=Problem_Behavior.Report_Warning = if term.is_empty then self else problem_builder = Problem_Builder.new selection = self.columns_helper.select_columns_helper columns reorder=False problem_builder selected_names = Map.from_vector (selection.map column-> [column.name, True]) map_preserve_name column f = column.map f . rename column.name - do_replace = _.replace term new_text mode matcher + do_replace = _.replace term new_text case_sensitivity=case_sensitivity only_first=only_first use_regex=use_regex do_replace_only_text = case _ of item : Text -> do_replace item item -> item diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Naming_Helpers.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Naming_Helpers.enso index ac5fa4615e0..c4ff1ecb201 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Naming_Helpers.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Naming_Helpers.enso @@ -18,7 +18,7 @@ type Naming_Helpers sanitize_name : Text -> Text sanitize_name name = # Using the regex matcher due to the #5831 bug. 
- name.replace '\0' '\\\\0' matcher=Regex_Matcher.Value + name.replace '\0' '\\0' use_regex=True ## PRIVATE Generates a column name for a binary operation. diff --git a/std-bits/base/src/main/java/org/enso/base/Replacer_Cache.java b/std-bits/base/src/main/java/org/enso/base/Replacer_Cache.java new file mode 100644 index 00000000000..f7f202a607c --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/Replacer_Cache.java @@ -0,0 +1,51 @@ +package org.enso.base; + +import org.graalvm.collections.Pair; +import org.graalvm.polyglot.Value; + +import java.util.ArrayList; +import java.util.function.Function; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class Replacer_Cache { + private static final int lruSize = 5; + + // Circular buffer containing the most recent cache entries (key, value). + private static final List<Pair<String, Value>> lru = new ArrayList<>(lruSize); + + static { + for (int i = 0; i < lruSize; ++i) { + lru.add(null); + } + } + + // Index into the circular buffer. + private static int nextSlot = 0; + + public static Value get_or_set(String key, Function<Void, Value> value_producer) { + Value value = get(key); + if (value == null) { + value = value_producer.apply(null); + lru.set(nextSlot, Pair.create(key, value)); + nextSlot = (nextSlot + 1) % lruSize; + } + return value; + } + + // Visible for testing. 
+ public static Value get(String key) { + for (int i = 0; i < lruSize; ++i) { + Pair<String, Value> pair = lru.get(i); + if (pair != null && pair.getLeft().equals(key)) { + return pair.getRight(); + } + } + return null; + } + + public static int getLruSize() { + return lruSize; + } +} diff --git a/test/Table_Tests/src/In_Memory/Table_Spec.enso b/test/Table_Tests/src/In_Memory/Table_Spec.enso index a9160c916b6..9c1763d6845 100644 --- a/test/Table_Tests/src/In_Memory/Table_Spec.enso +++ b/test/Table_Tests/src/In_Memory/Table_Spec.enso @@ -644,7 +644,7 @@ spec = bools = ["bools", [False, False, True, True]] texts = ["texts", ["foo", "bar", "baz", "spam"]] table = Table.new [bools, texts] - actual = table.replace_text "texts" "(a|o)" "$1e" matcher=Regex_Matcher.Value + actual = table.replace_text "texts" "(a|o)" "$1e" use_regex=True actual.at "texts" . to_vector . should_equal ["foeoe", "baer", "baez", "spaem"] Problems.assume_no_problems actual diff --git a/test/Tests/src/Data/Text/Regex_2_Spec.enso b/test/Tests/src/Data/Text/Regex_2_Spec.enso index e8e2df79514..09856de1a85 100644 --- a/test/Tests/src/Data/Text/Regex_2_Spec.enso +++ b/test/Tests/src/Data/Text/Regex_2_Spec.enso @@ -1,199 +1,98 @@ from Standard.Base import all +import Standard.Base.Data.Text.Span.Span import Standard.Base.Data.Text.Span.Utf_16_Span -import Standard.Base.Data.Text.Regex_2 import Standard.Base.Data.Text.Regex.Match_2.Match_2 +import Standard.Base.Data.Text.Regex.Pattern_2.Pattern_2 +import Standard.Base.Data.Text.Regex.Replacer.Replacer +import Standard.Base.Data.Text.Regex_2 import Standard.Base.Data.Text.Regex_2.No_Such_Group import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error +import Standard.Base.Errors.Illegal_Argument.Illegal_Argument + +from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup from Standard.Test import Test, Test_Suite import Standard.Test.Extensions -# default_mask = Java_Pattern.CANON_EQ.bit_or Java_Pattern.UNICODE_CASE . 
bit_or Java_Pattern.UNICODE_CHARACTER_CLASS +polyglot java import org.enso.base.Replacer_Cache spec = - ## - Test.group "The default regex engine's options handling" <| + Test.group "Compile" <| + Test.specify "should be able to be compiled" <| + pattern = Regex_2.compile "(?..)" case_insensitive=True + pattern . should_be_a Pattern_2 - Test.specify "should convert options to Java" <| - options = [Regex_Option.Comments, Regex_Option.Multiline, Default_Engine.Option.Unix_Lines] - expected_mask = Java_Pattern.UNIX_LINES.bit_or Java_Pattern.COMMENTS . bit_or Java_Pattern.MULTILINE . bit_or default_mask - actual_mask = Default_Engine.from_enso_options options + Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <| + Regex_2.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error - actual_mask . should_equal expected_mask + Test.specify "should disallow empty patterns in `compile`" <| + Regex_2.compile "" . should_fail_with Illegal_Argument - Test.specify "should specify the unicode options by default" <| - actual_mask = Default_Engine.from_enso_options [] + Test.group "Escape" <| + Test.specify "should escape an expression for use as a literal" <| + pattern = "http://example.com" + Regex_2.escape pattern . should_equal "\Qhttp://example.com\E" - actual_mask . should_equal default_mask + Test.group "Pattern.matches" <| + Test.specify "should return True when the pattern matches against the input" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + pattern.matches input . should_be_true - Test.specify "should handle ascii matching by disabling unicode" <| - actual_mask = Default_Engine.from_enso_options [Regex_Option.Ascii_Matching] - actual_mask . should_equal 0 + Test.specify "should return False when the pattern doesn't match against the input" <| + pattern = Regex_2.compile "aaz" + input = "aa ab abc a bc bcd" + pattern.matches input . 
should_be_false - Test.specify "should result in an error when an option is invalid" <| - Default_Engine.from_enso_options [""] . should_fail_with Invalid_Option - Default_Engine.from_enso_options ["", Regex_Option.Ascii_Matching] . should_fail_with Invalid_Option + Test.specify "should check for full matches" <| + pattern = Regex_2.compile "f.o" + pattern.matches "foo" . should_be_true + pattern.matches "foobar" . should_be_false - Test.group "The default regex engine (Default_Engine)" <| + Test.specify "`matches` with an empty pattern should be an error" <| + pattern = Regex_2.compile "" + pattern.matches "ABC" . should_fail_with Illegal_Argument - Test.specify "should be able to compile patterns with no options" <| - engine = Default_Engine.new - pattern = engine.compile "^a$" [] - pattern.engine . should_equal engine - pattern.options . should_equal [] - pattern.internal_pattern.flags . should_equal default_mask + Test.group "Pattern.match" <| + Test.specify "should be able to `match` the first instance of the pattern in the input" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match_2 + match.text 0 . should_equal input - Test.specify "should be able to compile patterns with global options" <| - engine = Default_Engine.new - pattern = engine.compile "^a$" [Regex_Option.Multiline] - pattern.engine . should_equal engine - pattern.options . should_equal [Regex_Option.Multiline] - pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.MULTILINE) + Test.specify "should return `Nothing` if there are no matches in first mode" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "abc" + match = pattern.match input + match . 
should_equal Nothing - Test.specify "should be able to compile patterns with engine-specific options" <| - engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern] - pattern = engine.compile "^a$" [] - pattern.engine . should_equal engine - pattern.options . should_equal [Default_Engine.Option.Literal_Pattern] - pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.LITERAL) + Test.specify "should be able to `match` the all instances of the pattern in the input" <| + pattern = Regex_2.compile "(..)" + input = "abcdefghij" + matches = pattern.match_all input + matches.length . should_equal 5 + matches.at 0 . text 0 . should_equal "ab" + matches.at 1 . text 0 . should_equal "cd" + matches.at 2 . text 0 . should_equal "ef" + matches.at 3 . text 0 . should_equal "gh" + matches.at 4 . text 0 . should_equal "ij" - Test.specify "should be able to compile patterns with combined options" <| - engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern] - pattern = engine.compile "^a$" [Regex_Option.Comments] - pattern.engine . should_equal engine - pattern.options.contains Default_Engine.Option.Literal_Pattern . should_be_true - pattern.options.contains Regex_Option.Comments . should_be_true - pattern.internal_pattern.flags . should_equal (default_mask . bit_or Java_Pattern.LITERAL . bit_or Java_Pattern.COMMENTS) + Test.specify "should return `[]` when an all match match fails" <| + pattern = Regex_2.compile "(aa)" + input = "abcdefghij" + match = pattern.match_all input + match . should_equal [] - Test.specify "should return a syntax error of the regex syntax is invalid" <| - engine = Default_Engine.new - engine.compile "^(a" [] . should_fail_with Syntax_Error + Test.specify "`match` with an empty pattern should be an error" <| + pattern = Regex_2.compile "" + pattern.match "ABC" . 
should_fail_with Illegal_Argument - Test.specify "should throw an invalid options error if an option is invalid" <| - engine = Default_Engine.new - engine.compile "^a$" ["invalid"] . should_fail_with Invalid_Option + Test.specify "`match_all` with an empty pattern should be an error" <| + pattern = Regex_2.compile "" + pattern.match_all "ABC" . should_fail_with Illegal_Argument - Test.specify "should escape an expression for use as a literal" <| - pattern = "http://example.com" - engine = Default_Engine.new - engine.escape pattern . should_equal "\Qhttp://example.com\E" - - Test.group "The default regex engine's Pattern.matches" <| - engine = Default_Engine.new - - Test.specify "should return True when the pattern matches against the input" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - pattern.matches input . should_be_true - - Test.specify "should return False when the pattern doesn't match against the input" <| - pattern = engine.compile "aaz" [] - input = "aa ab abc a bc bcd" - pattern.matches input . should_be_false - - Test.specify "should check for full matches" <| - pattern = engine.compile "f.o" [] - pattern.matches "foo" . should_be_true - pattern.matches "foobar" . should_be_false - - Test.group "The default regex engine's Pattern.match" <| - engine = Default_Engine.new - - Test.specify "should be able to `match` the first instance of the pattern in the input" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - match . should_be_a Default_Engine.Match.Value - match.text 0 . should_equal input - - Test.specify "should return `Nothing` if there are no matches in first mode" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "abc" - match = pattern.match input mode=Matching_Mode.First - match . 
should_equal Nothing - - Test.specify "should be able to `match` at most N instances of the pattern in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.match input mode=3 - match.length . should_equal 3 - match.at 0 . group 0 . should_equal "ab" - match.at 1 . group 0 . should_equal "cd" - match.at 2 . group 0 . should_equal "ef" - - Test.specify "should `match` fewer than N instances when there are fewer than N in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdef" - match = pattern.match input mode=5 - match.length . should_equal 3 - match.at 0 . group 0 . should_equal "ab" - match.at 1 . group 0 . should_equal "cd" - match.at 2 . group 0 . should_equal "ef" - - Test.specify "should return `Nothing` when a counted match fails" <| - pattern = engine.compile "(aa)" [] - input = "abcdefghij" - match = pattern.match input mode=3 - match . should_equal Nothing - - Test.specify "should be able to `match` the all instances of the pattern in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.match input mode=Regex_Mode.All - match.length . should_equal 5 - match.at 0 . group 0 . should_equal "ab" - match.at 1 . group 0 . should_equal "cd" - match.at 2 . group 0 . should_equal "ef" - match.at 3 . group 0 . should_equal "gh" - match.at 4 . group 0 . should_equal "ij" - - Test.specify "should return `Nothing` when an all match match fails" <| - pattern = engine.compile "(aa)" [] - input = "abcdefghij" - match = pattern.match input mode=Regex_Mode.All - match . should_equal Nothing - - Test.specify "should be able to `match` the pattern against the entire input" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Regex_Mode.Full - match . should_be_a Default_Engine.Match.Value - match.text 0 . 
should_equal input - - Test.specify "should return `Nothing` if a full match does not match the entire input" <| - pattern = engine.compile "(..)" [] - input = "aa ab" - full_match = pattern.match input mode=Regex_Mode.Full - full_match . should_equal Nothing - match = pattern.match input mode=Matching_Mode.First - match . should_be_a Default_Engine.Match.Value - - Test.specify "should be able to `match` the pattern against bounded input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.match input mode=(Regex_Mode.Bounded 2 8) - match.length . should_equal 3 - match.at 0 . text 0 . should_equal "cd" - match.at 1 . text 0 . should_equal "ef" - match.at 2 . text 0 . should_equal "gh" - - Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <| - pattern = engine.compile "" [] - match_1 = pattern.match "" mode=Regex_Mode.All - match_1.length . should_equal 1 - match_1.at 0 . start 0 . should_equal 0 - match_1.at 0 . end 0 . should_equal 0 - - match_2 = pattern.match "ABC" mode=Regex_Mode.All - match_2.length . should_equal 4 - match_2.at 0 . start 0 . should_equal 0 - match_2.at 0 . end 0 . should_equal 0 - match_2.at 1 . start 0 . should_equal 1 - match_2.at 1 . end 0 . should_equal 1 - match_2.at 3 . start 0 . should_equal 3 - match_2.at 3 . end 0 . should_equal 3 - - Test.group "The default regex engine's Pattern.find" <| + Test.group "Pattern_2.find and .find_all" <| Test.specify "should be able to `find` the first instance of the pattern in the input" <| pattern = Regex_2.compile "(..)" input = "abcdefghij" @@ -229,6 +128,14 @@ spec = Regex_2.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"] Regex_2.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"] + Test.specify "`find` with an empty pattern should be an error" <| + pattern = Regex_2.compile "" + pattern.find "ABC" . 
should_fail_with Illegal_Argument + + Test.specify "`find_all` with an empty pattern should be an error" <| + pattern = Regex_2.compile "" + pattern.find_all "ABC" . should_fail_with Illegal_Argument + ## Test.group "The default regex engine's Pattern.split" <| engine = Default_Engine.new @@ -279,152 +186,132 @@ spec = match.at 3 . should_equal "e" match.at 4 . should_equal "f" - Test.group "The default regex engine's Pattern.replace" <| - engine = Default_Engine.new + Test.group "Pattern_2.replace" <| + Test.specify "should be able to `replace` the first instance of the pattern in the input" <| + pattern = Regex_2.compile "abc" + input = "aa ab abc a bc abc" + match = pattern.replace input "REPLACED" only_first=True + match . should_be_a Text + match . should_equal "aa ab REPLACED a bc abc" - Test.specify "should be able to `replace` the first instance of the pattern in the input" <| - pattern = engine.compile "abc" [] - input = "aa ab abc a bc abc" - match = pattern.replace input "REPLACED" mode=Matching_Mode.First - match . should_be_a Text - match . should_equal "aa ab REPLACED a bc abc" + Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <| + pattern = Regex_2.compile "xyz" + input = "aa ab ac ad" + match = pattern.replace input "REPLACED" only_first=True + match . should_equal input - Test.specify "should return the string unchanged if there are no matches to replace in first mode" <| - pattern = engine.compile "xyz" [] - input = "aa ab ac ad" - match = pattern.replace input "REPLACED" mode=Matching_Mode.First - match . should_equal input + Test.specify "should be able to replace the all instances of the pattern in the input" <| + pattern = Regex_2.compile "aa" + input = "aa ab aa ac ad aa aa ax" + match = pattern.replace input "REPLACED" + match . 
should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax" - Test.specify "should be able to replace at most N instances of the pattern in the input" <| - pattern = engine.compile "aa" [] - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "REPLACED" mode=3 - match . should_equal "REPLACED ab REPLACED ac ad REPLACED aa ax" + Test.specify "should return the input when an all replace fails" <| + pattern = Regex_2.compile "aa" + input = "abcdefghij" + match = pattern.replace input "REPLACED" + match . should_equal input - Test.specify "should replace fewer than N instances when there are fewer than N in the input" <| - pattern = engine.compile "aa" [] - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "REPLACED" mode=10 - match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax" - - Test.specify "should return the input when a counted replace fails" <| - pattern = engine.compile "aa" [] - input = "abcdefghij" - match = pattern.replace input "REPLACED" mode=3 - match . should_equal input - - Test.specify "should be able to replace the all instances of the pattern in the input" <| - pattern = engine.compile "aa" [] - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "REPLACED" mode=Regex_Mode.All - match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax" - - Test.specify "should return the input when an all replace fails" <| - pattern = engine.compile "aa" [] - input = "abcdefghij" - match = pattern.replace input "REPLACED" mode=Regex_Mode.All - match . should_equal input - - Test.specify "should be able to replace the entire input only if it matches" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.replace input "REPLACED" mode=Regex_Mode.Full - match . 
should_equal "REPLACED" - - Test.specify "should correctly replace entire input in Full mode even if partial matches are possible" <| - pattern = engine.compile "(aa)+" [] - pattern.replace "aaa" "REPLACED" mode=Regex_Mode.Full . should_equal "aaa" - pattern.replace "aaaa" "REPLACED" mode=Regex_Mode.Full . should_equal "REPLACED" - - Test.specify "should return the input for a full replace if the pattern doesn't match the entire input" <| - pattern = engine.compile "(..)" [] - input = "aa ab" - full_match = pattern.replace input "REPLACED" mode=Regex_Mode.Full - full_match . should_equal input - - Test.specify "should not perform overlapping replacements in counted mode" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - result = pattern.replace input "REPLACED" mode=3 - result . should_equal "REPLACEDREPLACEDREPLACEDghij" - - Test.specify "should not perform overlapping replacements in all mode" <| - pattern = engine.compile "(..)" [] - input = "aa ab" - match = pattern.replace input "REPLACED" mode=Regex_Mode.All - match . should_equal "REPLACEDREPLACEDb" - - Test.specify "should handle capture groups in replacement" <| - pattern = engine.compile "(?[a-z]+)" [] - pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$1]" mode=0 . should_equal "foo bar, baz" - pattern.replace "foo bar, baz" "[$1]" mode=1 . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[$1]" mode=2 . should_equal "[foo] [bar], baz" - pattern.replace "foo bar, baz" "[$1]" mode=3 . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$1]" mode=4 . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.First . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]" - - pattern.replace "foo bar, baz" "[${capture}]" mode=Regex_Mode.All . 
should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[${capture}]" mode=0 . should_equal "foo bar, baz" - pattern.replace "foo bar, baz" "[${capture}]" mode=1 . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[${capture}]" mode=2 . should_equal "[foo] [bar], baz" - pattern.replace "foo bar, baz" "[${capture}]" mode=3 . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[${capture}]" mode=4 . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.First . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]" - - Test.specify "should handle capture groups in replacement in All mode" <| - pattern = engine.compile "([a-z]+)" [] - pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.Full . should_equal "foo bar, baz" - pattern.replace "foo" "[$1]" mode=Regex_Mode.Full . should_equal "[foo]" - - pattern_2 = engine.compile '(?.*?)' [] - pattern_2.replace 'content' "$2 <- $1" mode=Regex_Mode.Full . should_equal "content <- url" - pattern_2.replace 'content' "${name} <- ${addr}" mode=Regex_Mode.Full . should_equal "content <- url" - - Test.group "Match.group" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] + Test.specify "should be able to replace the entire input only if it matches" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - match . should_be_a Default_Engine.Match.Value + match = pattern.replace input "REPLACED" + match . should_equal "REPLACED" - Test.specify "should return the full match with index 0" <| - match.group 0 . should_equal "aa ab abc a bc bcd" + Test.specify "should not perform overlapping replacements in all mode" <| + pattern = Regex_2.compile "(..)" + input = "aa ab" + match = pattern.replace input "REPLACED" + match . 
should_equal "REPLACEDREPLACEDb" - Test.specify "should return the group contents if it matches by index" <| - match.group 1 . should_equal "aa ab " + Test.specify "should handle capture groups in replacement" <| + pattern = Regex_2.compile "(?<capture>[a-z]+)" + pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]" + pattern.replace "foo bar, baz" "[$1]" only_first=True . should_equal "[foo] bar, baz" - Test.specify "should return the group contents if it matches by name" <| - match.group "letters" . should_equal "abc a bc bcd" + pattern.replace "foo bar, baz" "[$<capture>]" . should_equal "[foo] [bar], [baz]" + pattern.replace "foo bar, baz" "[$<capture>]" only_first=True . should_equal "[foo] bar, baz" - Test.specify "should return Nothing if the group did not match" <| - match.group 3 . should_equal Nothing + pattern.replace "foo bar, baz" "[$0]" . should_equal "[foo] [bar], [baz]" + pattern.replace "foo bar, baz" "[$0]" only_first=True . should_equal "[foo] bar, baz" + pattern.replace "foo bar, baz" "[$&]" . should_equal "[foo] [bar], [baz]" + pattern.replace "foo bar, baz" "[$&]" only_first=True . should_equal "[foo] bar, baz" - Test.specify "should fail with No_Such_Group_Error if the group did not exist" <| - match.group "fail" . should_fail_with No_Such_Group - match.group 5 . should_fail_with No_Such_Group + Test.specify "should handle unicode in capture group names" <| + pattern = Regex_2.compile "(?<건반>[a-z]+)" + pattern.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]" - Test.specify "should make named groups accessible by index" <| - match.group 2 . should_equal (match.group "letters") + Test.group "should correctly evaluate documentation examples" <| + Test.specify "example 1" <| + pattern = Regex_2.compile 'aa' + pattern.replace 'aaa' 'b' . should_equal 'ba' - Test.group "Match.groups" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" 
[] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - match . should_be_a Default_Engine.Match.Value + Test.specify "example 2" <| + pattern = Regex_2.compile '[lo]' + pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!' - Test.specify "should return the results of all groups" <| - groups = match.groups - groups.length . should_equal 5 - groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing] + Test.specify "example 3" <| + pattern = Regex_2.compile 'l' + pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!' - Test.specify "should replace unmatched groups by a user-specified value" <| - groups = match.groups "UNMATCHED" - groups.length . should_equal 5 - groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"] + Test.specify "example 4" <| + pattern = Regex_2.compile '"(.*?)"' + pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz' + + Test.specify "example 5" <| + pattern = Regex_2.compile "aa" + input = "aa ab aa ac ad aa aa ax" + match = pattern.replace input "xyz" + match . should_equal "xyz ab xyz ac ad xyz xyz ax" + + Test.specify "example 6" <| + pattern = Regex_2.compile "([a-z]+)" + pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]" + + Test.specify "`replace` with an empty pattern should be an error" <| + pattern = Regex_2.compile "" + pattern.replace "ABC" . should_fail_with Illegal_Argument + + Test.group "Match.text" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match_2 + + Test.specify "should return the full match with index 0" <| + match.text 0 . should_equal "aa ab abc a bc bcd" + + Test.specify "should return the group contents if it matches by index" <| + match.text 1 . 
should_equal "aa ab " + + Test.specify "should return the group contents if it matches by name" <| + match.text "letters" . should_equal "abc a bc bcd" + + Test.specify "should return Nothing if the group did not match" <| + match.text 3 . should_equal Nothing + + Test.specify "should fail with No_Such_Group_Error if the group did not exist" <| + match.text "fail" . should_fail_with No_Such_Group + match.text 5 . should_fail_with No_Such_Group + + Test.specify "should make named groups accessible by index" <| + match.text 2 . should_equal (match.text "letters") + + Test.group "Match.groups" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match_2 + + Test.specify "should return the results of all groups" <| + groups = match.groups + groups.length . should_equal 5 + groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing] + + Test.specify "should replace unmatched groups by a user-specified value" <| + groups = match.groups "UNMATCHED" + groups.length . should_equal 5 + groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"] Test.group "Match.named_groups" <| pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" @@ -445,147 +332,135 @@ spec = groups.at "letters" . should_equal "abc a bc bcd" groups.at "empty" . should_equal "UNMATCHED" - Test.group "Match.start" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 + Test.group "Match.start" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match_2 - Test.specify "should return the start of a group by index" <| - match.start 1 . should_equal 0 + Test.specify "should return the start of a group by index" <| + match.start 1 . 
should_equal 0 - Test.specify "should return the start of a group by name" <| - match.start "letters" . should_equal 6 + Test.specify "should return the start of a group by name" <| + match.start "letters" . should_equal 6 - Test.specify "should return Nothing if the group didn't match" <| - match.start 3 . should_equal Nothing - match.start "empty" . should_equal Nothing + Test.specify "should return Nothing if the group didn't match" <| + match.start 3 . should_equal Nothing + match.start "empty" . should_equal Nothing - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.start 5 . should_fail_with No_Such_Group - match.start "nonexistent" . should_fail_with No_Such_Group + Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| + match.start 5 . should_fail_with No_Such_Group + match.start "nonexistent" . should_fail_with No_Such_Group - Test.group "Match.end" <| + Test.group "Match.end" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match_2 + + Test.specify "should return the end of a group by index" <| + match.end 1 . should_equal 6 + + Test.specify "should return the end of a group by name" <| + match.end "letters" . should_equal 18 + + Test.specify "should return Nothing if the group didn't match" <| + match.end 3 . should_equal Nothing + match.end "empty" . should_equal Nothing + + Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| + match.end 5 . should_fail_with No_Such_Group + match.end "nonexistent" . should_fail_with No_Such_Group + + Test.group "Match.utf_16_start" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match_2 + + Test.specify "should return the start of a group by index" <| + match.utf_16_start 1 . 
should_equal 0 + + Test.specify "should return the start of a group by name" <| + match.utf_16_start "letters" . should_equal 6 + + Test.specify "should return Nothing if the group didn't match" <| + match.utf_16_start 3 . should_equal Nothing + match.utf_16_start "empty" . should_equal Nothing + + Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| + match.utf_16_start 5 . should_fail_with No_Such_Group + match.utf_16_start "nonexistent" . should_fail_with No_Such_Group + + Test.group "Match.utf_16_end" <| pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" input = "aa ab abc a bc bcd" match = pattern.match input match . should_be_a Match_2 Test.specify "should return the end of a group by index" <| - match.end 1 . should_equal 6 + match.utf_16_end 1 . should_equal 6 Test.specify "should return the end of a group by name" <| - match.end "letters" . should_equal 18 + match.utf_16_end "letters" . should_equal 18 Test.specify "should return Nothing if the group didn't match" <| - match.end 3 . should_equal Nothing - match.end "empty" . should_equal Nothing + match.utf_16_end 3 . should_equal Nothing + match.utf_16_end "empty" . should_equal Nothing Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.end 5 . should_fail_with No_Such_Group - match.end "nonexistent" . should_fail_with No_Such_Group + match.utf_16_end 5 . should_fail_with No_Such_Group + match.utf_16_end "nonexistent" . should_fail_with No_Such_Group - Test.group "Match.utf16_start" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 + Test.group "Match.span" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match_2 - Test.specify "should return the start of a group by index" <| - match.utf16_start 1 . 
should_equal 0 + Test.specify "should get the span of a group by index" <| + match.span 1 . should_equal (Span.Value (0.up_to 6) input) - Test.specify "should return the start of a group by name" <| - match.utf16_start "letters" . should_equal 6 + Test.specify "should get the span of a group by name" <| + match.span "letters" . should_equal (Span.Value (6.up_to 18) input) - Test.specify "should return Nothing if the group didn't match" <| - match.utf16_start 3 . should_equal Nothing - match.utf16_start "empty" . should_equal Nothing + Test.specify "should return Nothing if the group didn't match" <| + match.span 3 . should_equal Nothing + match.span "empty" . should_equal Nothing - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.utf16_start 5 . should_fail_with No_Such_Group - match.utf16_start "nonexistent" . should_fail_with No_Such_Group + Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <| + match.span 5 . should_fail_with No_Such_Group + match.span "nonexistent" . should_fail_with No_Such_Group - Test.group "Match.utf16_end" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 + Test.group "Match.utf_16_span" <| + pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match_2 - Test.specify "should return the end of a group by index" <| - match.utf16_end 1 . should_equal 6 + Test.specify "should get the UTF16 span of a group by index" <| + match.utf_16_span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input) - Test.specify "should return the end of a group by name" <| - match.utf16_end "letters" . should_equal 18 + Test.specify "should get the UTF16 span of a group by name" <| + match.utf_16_span "letters" . 
should_equal (Utf_16_Span.Value (6.up_to 18) input) - Test.specify "should return Nothing if the group didn't match" <| - match.utf16_end 3 . should_equal Nothing - match.utf16_end "empty" . should_equal Nothing + Test.specify "should return Nothing if the group didn't match" <| + match.utf_16_span 3 . should_equal Nothing + match.utf_16_span "empty" . should_equal Nothing - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.utf16_end 5 . should_fail_with No_Such_Group - match.utf16_end "nonexistent" . should_fail_with No_Such_Group + Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <| + match.utf_16_span 5 . should_fail_with No_Such_Group + match.utf_16_span "nonexistent" . should_fail_with No_Such_Group - ## - Test.group "Match.span" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - match . should_be_a Default_Engine.Match.Value + Test.group "caching" <| + Test.specify "Replacer cache drops old values" <| + pattern = Regex_2.compile('([a-c])') - Test.specify "should get the span of a group by index" <| - match.span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input) - - Test.specify "should get the span of a group by name" <| - match.span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input) - - Test.specify "should return Nothing if the group didn't match" <| - match.span 3 . should_equal Nothing - match.span "empty" . should_equal Nothing - - Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <| - match.span 5 . should_fail_with No_Such_Group - match.span "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.start_position" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - match . 
should_be_a Default_Engine.Match.Value - - Test.specify "should return the region start over which self match was performed" <| - match.start_position . should_equal 0 - - Test.group "Match.end_position" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - match . should_be_a Default_Engine.Match.Value - - Test.specify "should return the region end over which self match was performed" <| - match.end_position . should_equal 18 - - Test.group "Regex options handling" <| - Test.specify "should work properly with flag options" <| - flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[] - flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments] - - Test.specify "should properly override vector options" <| - flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[Regex_Option.Multiline, Regex_Option.Case_Insensitive] - flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Case_Insensitive, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments] - - Test.group "Regexes" <| - Test.specify "should be able to be compiled" <| - pattern = Regex.compile "(?..)" case_insensitive=True - pattern . should_be_a Default_Engine.Pattern.Value - pattern.options . should_equal [Regex_Option.Case_Insensitive] - - Test.specify "should be able to be escaped" <| - pattern = "http://example.com" - Regex.escape pattern . should_equal "\Qhttp://example.com\E" - - ## TODO: Missing tests for No_Such_Group_Error + # Add enough values to flush out the first values. + 0.up_to get_lru_size+1 . map i-> + result = pattern.replace "abcdef" ("$1$1x" + i.to_text) + result . should_not_equal Nothing + replacer_cache_lookup "$1$1x0" . 
should_equal Nothing + replacer_cache_lookup "$1$1x1" . should_not_equal Nothing main = Test_Suite.run_main spec diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index be1bf9e0a65..ac421088609 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -7,6 +7,7 @@ import Standard.Base.Errors.Common.Index_Out_Of_Bounds import Standard.Base.Errors.Common.Incomparable_Values import Standard.Base.Errors.Common.Type_Error import Standard.Base.Errors.Illegal_Argument.Illegal_Argument +import Standard.Base.IO import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine @@ -15,6 +16,7 @@ from Standard.Base.Data.Index_Sub_Range.Index_Sub_Range import all from Standard.Test import Test, Test_Suite import Standard.Test.Extensions +import Standard.Base.Data.Text.Extensions type Auto Value a @@ -1190,9 +1192,9 @@ spec = "Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing Test.specify "find should produce correct spans" <| - "Hello World!".find ".o" Case_Sensitivity.Insensitive . grapheme_span 0 . should_equal (Span.Value (3.up_to 5) "Hello World!") - "Hello World!".find_all ".o" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"] - "foobar".find "BAR" Case_Sensitivity.Insensitive . grapheme_span 0 . should_equal (Span.Value (3.up_to 6) "foobar") + "Hello World!".find ".o" Case_Sensitivity.Insensitive . span 0 . should_equal (Span.Value (3.up_to 5) "Hello World!") + "Hello World!".find_all ".o" . map (match-> match.span 0) . should_equal [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"] + "foobar".find "BAR" Case_Sensitivity.Insensitive . span 0 . should_equal (Span.Value (3.up_to 6) "foobar") Test.specify "should handle accents and other multi-point graphemes" <| accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h' @@ -1201,29 +1203,20 @@ spec = accents.find 'e\u{301}' . text 0 . 
should_equal 'e\u{301}' # Check both UTF16 spans - accents.find_all 'h' . map (match-> match.span 0) . should_equal [Utf_16_Span.Value (6.up_to 7) accents, Utf_16_Span.Value (9.up_to 10) accents] - accents.find_all 'e\u{301}' . map (match-> match.span 0) . should_equal [Utf_16_Span.Value (2.up_to 4) accents, Utf_16_Span.Value (7.up_to 9) accents] + accents.find_all 'h' . map (match-> match.utf_16_span 0) . should_equal [Utf_16_Span.Value (6.up_to 7) accents, Utf_16_Span.Value (9.up_to 10) accents] + accents.find_all 'e\u{301}' . map (match-> match.utf_16_span 0) . should_equal [Utf_16_Span.Value (2.up_to 4) accents, Utf_16_Span.Value (7.up_to 9) accents] # Check both grapheme spans - accents.find_all 'h' . map (match-> match.grapheme_span 0) . should_equal [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents] - accents.find_all 'e\u{301}' . map (match-> match.grapheme_span 0) . should_equal [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents] + accents.find_all 'h' . map (match-> match.span 0) . should_equal [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents] + accents.find_all 'e\u{301}' . map (match-> match.span 0) . should_equal [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents] # Check contents to make sure the spans' ranges are ok accents.find 'h' . text 0 . should_equal 'h' accents.find 'e\u{301}' . text 0 . should_equal 'e\u{301}' - Test.specify "should correctly handle regex edge cases in locate" pending="Figure out how to make Regex correctly handle empty patterns." <| - regex = Regex_Matcher.Value - "".match "foo" matcher=regex . should_equal Nothing - "".match "foo" matcher=regex mode=Matching_Mode.Last . should_equal Nothing - "".match_all "foo" matcher=regex . should_equal [] - "".match "" matcher=regex . should_equal "" - "".match_all "" matcher=regex . should_equal [""] - "".match "" matcher=regex mode=Matching_Mode.Last . should_equal "" - abc = 'A\u{301}ßC' - abc.match "" matcher=regex . 
should_equal abc - abc.match_all "" matcher=regex . should_equal ["", "", "", "", ""] - abc.match "" matcher=regex mode=Matching_Mode.Last . should_equal "" + Test.specify "should correctly handle regex edge cases in `find`" <| + "".find "foo" . should_equal Nothing + "".find_all "foo" . should_equal [] Test.specify "should handle overlapping matches as shown in the examples" <| "aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (1.up_to 3) "aaa") @@ -1256,6 +1249,12 @@ spec = txt.find "^m..a..z.a$" . text 0 . should_equal "maza건반zaa" txt.find "a..z" . text 0 . should_equal "a건반z" + Test.specify "`find` with an empty pattern should be an error" <| + 'b'.find '' . should_fail_with Illegal_Argument + + Test.specify "`find_all` with an empty pattern should be an error" <| + 'b'.find_all '' . should_fail_with Illegal_Argument + Test.specify "should be possible in case-insensitive mode" <| "MY".find "my" Case_Sensitivity.Insensitive . text 0 . should_equal "MY" @@ -1281,20 +1280,20 @@ spec = expose normalization methods to allow developers to do it themselves. accents = 'a\u{301}e\u{301}o\u{301}' - accents.find accent_1 . grapheme_span 0 . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}') + accents.find accent_1 . span 0 . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}') Test.specify "can return a vector of all match groups" <| "abc".find "ab((c)|(d))" . groups . should_equal ['abc', 'c', 'c', Nothing] - Test.specify "should default to group 0 in .span and .grapheme_span" <| - "abacadae".find "a[bc]" . span . should_equal (Utf_16_Span.Value (0.up_to 2) "abacadae") - 'a\u{301}e\u{301}o\u{301}'.find 'e\u{301}' . grapheme_span . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}') + Test.specify "should default to group 0 in .utf_16_span and .span" <| + "abacadae".find "a[bc]" . utf_16_span . 
should_equal (Utf_16_Span.Value (0.up_to 2) "abacadae") + 'a\u{301}e\u{301}o\u{301}'.find 'e\u{301}' . span . should_equal (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}') Test.specify "should allow to match one or more occurrences of a pattern in the text" <| - "abacadae".find_all "a[bc]" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"] - "abacadae".find_all "a." . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"] - "abacadae".find_all "a.*" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 8) "abacadae"] - "abacadae".find_all "a.+?" . map (match-> match.grapheme_span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"] + "abacadae".find_all "a[bc]" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"] + "abacadae".find_all "a." . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"] + "abacadae".find_all "a.*" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 8) "abacadae"] + "abacadae".find_all "a.+?" . map (match-> match.span 0) . should_equal [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"] Test.specify "should allow access to match groups by number" <| "abcddd".find "ab(c(d+))" . text 0 . should_equal "abcddd" @@ -1331,8 +1330,13 @@ spec = Test.specify "should expand a partial-grapheme match to the whole grapheme" <| 'e\u{301}'.find '\u{301}' . text 0 . 
should_equal 'e\u{301}' + Test.specify "should not allow non-default locale" <| + locale = Locale.new "en" "GB" "UTF-8" + 'a'.find 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument + 'a'.find_all 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument + Test.group "Text.match" <| - Test.specify "should default to regex" <| + Test.specify "should work correctly" <| "My Text: Goes Here".match "^My Text: (.+)$" . should_be_true "555-801-1923".match "^\d{3}-\d{3}-\d{4}$" . should_be_true "Hello".match "^[a-z]+$" . should_be_false @@ -1344,12 +1348,19 @@ spec = "abcd".match "abc" . should_be_false "x".match "[a-z]" . should_be_true + Test.specify "`match` with an empty pattern should be an error" <| + 'b'.match '' . should_fail_with Illegal_Argument + Test.specify "should be possible on unicode text" <| "Korean: 건반".match "^Korean: (.+)$" . should_be_true Test.specify "should be possible in case-insensitive mode" <| "MY".match "my" Case_Sensitivity.Insensitive . should_be_true + Test.specify "should not allow non-default locale" <| + locale = Locale.new "en" "GB" "UTF-8" + 'a'.match 'a' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_fail_with Illegal_Argument + Test.group "Regex splitting" <| Test.specify "should be possible on text" <| splits = "abcde".split "[bd]" Regex_Matcher.Value @@ -1402,141 +1413,113 @@ spec = Test.group "Text.replace" <| Test.specify "should work as in examples" <| 'aaa'.replace 'aa' 'b' . should_equal 'ba' - "Hello World!".replace "[lo]" "#" matcher=Regex_Matcher.Value . should_equal "He### W#r#d!" - "Hello World!".replace "l" "#" mode=Matching_Mode.First . should_equal "He#lo World!" - '"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' matcher=Regex_Matcher.Value . should_equal '(abc) foo (bar) baz' - 'ß'.replace 'S' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'AA' - 'affib'.replace 'i' 'X' matcher=Text_Matcher.Case_Insensitive . 
should_equal 'aXb' + "Hello World!".replace "[lo]" "#" use_regex=True . should_equal "He### W#r#d!" + "Hello World!".replace "l" "#" only_first=True . should_equal "He#lo World!" + '"abc" foo "bar" baz'.replace '"(.*?)"' '($1)' use_regex=True . should_equal '(abc) foo (bar) baz' + + Test.specify "works when mapped over a vector of inputs" <| + inputs = ["axyz", "bxyz", "xabcz", "zazaz"] + inputs.map (s-> s.replace "[abc]" "q" use_regex=True) . should_equal ["qxyz", "qxyz", "xqqqz", "zqzqz"] Test.specify "should correctly handle empty-string edge cases" <| - [Regex_Mode.All, Matching_Mode.First, Matching_Mode.Last] . each mode-> - 'aaa'.replace '' 'foo' mode=mode . should_equal 'aaa' - ''.replace '' '' mode=mode . should_equal '' - 'a'.replace 'a' '' mode=mode . should_equal '' - ''.replace 'a' 'b' mode=mode . should_equal '' + [True, False] . each only_first-> + 'aaa'.replace '' 'foo' only_first=only_first . should_equal 'aaa' + 'a'.replace 'a' '' only_first=only_first . should_equal '' + ''.replace 'a' 'b' only_first=only_first . should_equal '' - 'aba' . replace 'a' '' Matching_Mode.First . should_equal 'ba' - 'aba' . replace 'a' '' Matching_Mode.Last . should_equal 'ab' + 'aba' . replace 'a' '' only_first=True . should_equal 'ba' 'aba' . replace 'a' '' . should_equal 'b' 'aba' . replace 'c' '' . should_equal 'aba' Test.specify "should correctly handle first, all and last matching with overlapping occurrences" <| "aaa aaa".replace "aa" "c" . should_equal "ca ca" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.First . should_equal "ca aaa" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.Last . should_equal "aaa ac" + "aaa aaa".replace "aa" "c" only_first=True . should_equal "ca aaa" + + Test.specify "Regex `replace` with an empty pattern should be an error" <| + 'b'.replace '' 'c' use_regex=True . should_fail_with Illegal_Argument Test.specify "should correctly handle case-insensitive matches" <| - 'AaąĄ' . replace "A" "-" matcher=Text_Matcher.Case_Insensitive . 
should_equal '--ąĄ' + 'AaąĄ' . replace "A" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal '--ąĄ' 'AaąĄ' . replace "A" "-" . should_equal '-aąĄ' - 'HeLlO wOrLd' . replace 'hElLo' 'Hey,' matcher=Text_Matcher.Case_Sensitive . should_equal 'HeLlO wOrLd' - 'HeLlO wOrLd' . replace 'hElLo' 'Hey,' matcher=Text_Matcher.Case_Insensitive . should_equal 'Hey, wOrLd' + 'HeLlO wOrLd' . replace 'hElLo' 'Hey,' case_sensitivity=Case_Sensitivity.Sensitive . should_equal 'HeLlO wOrLd' + 'HeLlO wOrLd' . replace 'hElLo' 'Hey,' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'Hey, wOrLd' "Iiİı" . replace "i" "-" . should_equal "I-İı" "Iiİı" . replace "I" "-" . should_equal "-iİı" "Iiİı" . replace "İ" "-" . should_equal "Ii-ı" "Iiİı" . replace "ı" "-" . should_equal "Iiİ-" - "Iiİı" . replace "i" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "--İı" - "Iiİı" . replace "I" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "--İı" - "Iiİı" . replace "İ" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "Ii-ı" - "Iiİı" . replace "ı" "-" matcher=Text_Matcher.Case_Insensitive . should_equal "Iiİ-" + "Iiİı" . replace "i" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "--İı" + "Iiİı" . replace "I" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "--İı" + "Iiİı" . replace "İ" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "Ii-ı" + "Iiİı" . replace "ı" "-" case_sensitivity=Case_Sensitivity.Insensitive . should_equal "Iiİ-" - tr_insensitive = Text_Matcher.Case_Insensitive (Locale.new "tr") - "Iiİı" . replace "i" "-" matcher=tr_insensitive . should_equal "I--ı" - "Iiİı" . replace "I" "-" matcher=tr_insensitive . should_equal "-iİ-" - "Iiİı" . replace "İ" "-" matcher=tr_insensitive . should_equal "I--ı" - "Iiİı" . replace "ı" "-" matcher=tr_insensitive . should_equal "-iİ-" + Test.specify "should correctly handle Unicode" <| + 'ß'.replace 'S' 'A' case_sensitivity=Case_Sensitivity.Insensitive . 
should_equal 'AA' + 'ß'.replace 'ß' 'A' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'A' + 'affib'.replace 'i' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb' + 'affib'.replace 'ffi' 'X' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'aXb' - Test.specify "should correctly handle Unicode edge cases" <| 'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}' - 'sśs\u{301}' . replace 's' 'O' Matching_Mode.Last . should_equal 'Ośs\u{301}' - 'śs\u{301}s' . replace 's' 'O' Matching_Mode.First . should_equal 'śs\u{301}O' + 'śss\u{301}' . replace 's' 'O' only_first=True . should_equal 'śOs\u{301}' 'sśs\u{301}' . replace 'ś' 'O' . should_equal 'sOO' + 'śss\u{301}' . replace 'ś' 'O' only_first=True . should_equal 'Oss\u{301}' + 'sśs\u{301}' . replace 's\u{301}' 'O' . should_equal 'sOO' + 's\u{301}śs' . replace 's\u{301}' 'O' . should_equal 'OOs' 'SŚS\u{301}' . replace 's' 'O' . should_equal 'SŚS\u{301}' - 'SŚS\u{301}' . replace 's' 'O' Matching_Mode.Last . should_equal 'SŚS\u{301}' - 'ŚS\u{301}S' . replace 's' 'O' Matching_Mode.First . should_equal 'ŚS\u{301}S' + 'ŚS\u{301}S' . replace 's' 'O' only_first=True . should_equal 'ŚS\u{301}S' 'SŚS\u{301}' . replace 'ś' 'O' . should_equal 'SŚS\u{301}' 'SŚS\u{301}' . replace 's\u{301}' 'O' . should_equal 'SŚS\u{301}' - 'SŚS\u{301}' . replace 's' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'OŚS\u{301}' - 'SŚS\u{301}' . replace 's' 'O' Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'OŚS\u{301}' - 'ŚS\u{301}S' . replace 's' 'O' Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'ŚS\u{301}O' + 'SŚS\u{301}' . replace 's' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'OŚS\u{301}' + 'ŚS\u{301}S' . replace 's' 'O' only_first=True case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'ŚS\u{301}O' # 'ŚO\u{301}O' # 'ŚOS\u{301}S' - 'SŚS\u{301}' . replace 'ś' 'O' matcher=Text_Matcher.Case_Insensitive . 
should_equal 'SOO' - 'SŚS\u{301}' . replace 's\u{301}' 'O' matcher=Text_Matcher.Case_Insensitive . should_equal 'SOO' + 'SŚS\u{301}' . replace 'ś' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'SOO' + 'SŚS\u{301}' . replace 's\u{301}' 'O' case_sensitivity=Case_Sensitivity.Insensitive . should_equal 'SOO' '✨🚀🚧😍😃😍😎😙😉☺' . replace '🚧😍' '|-|:)' . should_equal '✨🚀|-|:)😃😍😎😙😉☺' 'Rocket Science' . replace 'Rocket' '🚀' . should_equal '🚀 Science' "Korean: 건반".replace "건반" "keyboard" . should_equal "Korean: keyboard" - Test.specify "will approximate ligature matches" <| - # TODO do we want to improve this? highly non-trivial for very rare edge cases - ## Currently we lack 'resolution' to extract a partial match from - the ligature to keep it, probably would need some special - mapping. - 'ffiffi'.replace 'ff' 'aa' matcher=Text_Matcher.Case_Insensitive . should_equal 'aaaa' - 'ffiffi'.replace 'ff' 'aa' mode=Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'aaffi' - 'ffiffi'.replace 'ff' 'aa' mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'ffiaa' - 'affiffib'.replace 'IF' 'X' matcher=Text_Matcher.Case_Insensitive . should_equal 'aXb' - 'aiffiffz' . replace 'if' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'a--fz' - 'AFFIB'.replace 'ffi' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'A-B' - - 'ß'.replace 'SS' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'A' - 'ß'.replace 'S' 'A' matcher=Text_Matcher.Case_Insensitive . should_equal 'AA' - 'ß'.replace 'S' 'A' mode=Matching_Mode.First matcher=Text_Matcher.Case_Insensitive . should_equal 'A' - 'ß'.replace 'S' 'A' mode=Matching_Mode.Last matcher=Text_Matcher.Case_Insensitive . should_equal 'A' - 'STRASSE'.replace 'ß' '-' matcher=Text_Matcher.Case_Insensitive . should_equal 'STRA-E' + Test.specify "regex and non-regex replace handle accented grapheme splitting differently" <| + 'sśs\u{301}' . replace 's' 'O' . 
should_equal 'Ośs\u{301}' + 'sśs\u{301}' . replace 's' 'O' use_regex=True . should_equal 'OśO\u{301}' Test.specify "should perform simple replacement in Regex mode" <| - "ababab".replace "b" "a" matcher=Regex_Matcher.Value . should_equal "aaaaaa" - "ababab".replace "b" "a" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "aaabab" - "ababab".replace "b" "a" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "ababaa" + "ababab".replace "b" "a" use_regex=True . should_equal "aaaaaa" + "ababab".replace "b" "a" only_first=True use_regex=True . should_equal "aaabab" - "aaaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "cc" - "aaaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "caa" - "aaaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "aac" + "aaaa".replace "aa" "c" use_regex=True . should_equal "cc" + "aaaa".replace "aa" "c" only_first=True use_regex=True . should_equal "caa" - "aaa".replace "aa" "c" matcher=Regex_Matcher.Value . should_equal "ca" - "aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "ca" - "aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Sensitive . should_equal "ac" - "aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "ca" + "aaa".replace "aa" "c" use_regex=True . should_equal "ca" + "aaa".replace "aa" "c" only_first=True use_regex=True . should_equal "ca" - "aaa aaa".replace "aa" "c" matcher=Text_Matcher.Case_Sensitive . should_equal "ca ca" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Text_Matcher.Case_Sensitive . should_equal "ca aaa" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Text_Matcher.Case_Sensitive . should_equal "aaa ac" - "aaa aaa".replace "aa" "c" matcher=Regex_Matcher.Value . 
should_equal "ca ca" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.First matcher=Regex_Matcher.Value . should_equal "ca aaa" - "aaa aaa".replace "aa" "c" mode=Matching_Mode.Last matcher=Regex_Matcher.Value . should_equal "aaa ca" + "aaa aaa".replace "aa" "c" case_sensitivity=Case_Sensitivity.Sensitive use_regex=True . should_equal "ca ca" + "aaa aaa".replace "aa" "c" only_first=True case_sensitivity=Case_Sensitivity.Sensitive use_regex=True . should_equal "ca aaa" + "aaa aaa".replace "aa" "c" use_regex=True . should_equal "ca ca" + "aaa aaa".replace "aa" "c" only_first=True use_regex=True . should_equal "ca aaa" Test.specify "in Regex mode should work with Unicode" <| - "Korean: 건반".replace "건반" "keyboard" matcher=Regex_Matcher.Value . should_equal "Korean: keyboard" - 'sśs\u{301}'.replace 'ś' '-' matcher=Regex_Matcher.Value . should_equal 's--' - 'sśs\u{301}'.replace 's\u{301}' '-' matcher=Regex_Matcher.Value . should_equal 's--' - - Test.specify "in Regex mode should support various Regex options" <| - r1 = "İiİ".replace "\w" "a" matcher=(Regex_Matcher.Value match_ascii=True) - r1 . should_equal "İaİ" - r2 = "abaBa".replace "b" "a" matcher=(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive) - r2 . should_equal "aaaaa" - r3 = 'ab\na'.replace "b." "a" matcher=(Regex_Matcher.Value dot_matches_newline=True) - r3 . should_equal "aaa" - - text = """ - Foo - bar - r4 = text.replace '\n' "" matcher=(Regex_Matcher.Value multiline=True) - r4 . should_equal "Foobar" - - r5 = "ababd".replace "b\w # Replacing a `b` followed by any word character" "a" matcher=(Regex_Matcher.Value comments=True) - r5 . should_equal "aaa" + "Korean: 건반".replace "건반" "keyboard" use_regex=True . should_equal "Korean: keyboard" + 'sśs\u{301}'.replace 'ś' '-' use_regex=True . should_equal 's-s\u{301}' + 'sśs\u{301}'.replace 's\u{301}' '-' use_regex=True . 
should_equal 'sś-' Test.specify "in Regex mode should allow referring to capture groups in substitutions" <| - '<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' matcher=Regex_Matcher.Value . should_equal 'content is at url' - '<a href="url">content</a>'.replace '<a href="(?<address>.*?)">(?<text>.*?)</a>' '${text} is at ${address}' matcher=Regex_Matcher.Value . should_equal 'content is at url' + '<a href="url">content</a>'.replace '<a href="(.*?)">(.*?)</a>' '$2 is at $1' use_regex=True . should_equal 'content is at url' + '<a href="url">content</a>'.replace '<a href="(?<address>.*?)">(?<text>.*?)</a>' '$<text> is at $<address>' use_regex=True . should_equal 'content is at url'
+ + Test.specify "should not allow non-default locale in regex replace" <| + locale = Locale.new "en" "GB" "UTF-8" + 'a'.replace 'a' 'b' case_sensitivity=(Case_Sensitivity.Insensitive locale) use_regex=True . should_fail_with Illegal_Argument + + Test.specify "should allow non-default locale in text replace" <| + locale = Locale.new "en" "GB" "UTF-8" + 'a'.replace 'a' 'b' case_sensitivity=(Case_Sensitivity.Insensitive locale) . should_equal 'b' main = Test_Suite.run_main spec