diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso index 1ccfa8143a..7023cbb006 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Filter_Condition.enso @@ -100,13 +100,11 @@ type Filter_Condition Table operations, it can accept another column - then the corresponding values from the source column and the provided column are checked. - ! Known Bugs - There is a known bug in Java Regex where escape characters are not - handled properly in Unicode-normalized matching mode. Due to this - limitation, Unicode normalization has been disabled for this function, - so beware that some equivalent graphemes like 'ś' and 's\u0301' will - not be matched. - See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926 + ! Known Limitations. + The Truffle regex engine does not transparently handle normalization. + Due to this limitation, Unicode normalization has been disabled for + this function, so beware that some equivalent graphemes like 'ś' and + 's\u0301' will not be matched. Like pattern:Text ## Does the value not match the SQL pattern (Text only)? @@ -121,13 +119,11 @@ type Filter_Condition Table operations, it can accept another column - then the corresponding values from the source column and the provided column are checked. - ! Known Bugs - There is a known bug in Java Regex where escape characters are not - handled properly in Unicode-normalized matching mode. Due to this - limitation, Unicode normalization has been disabled for this function, - so beware that some equivalent graphemes like 'ś' and 's\u0301' will - not be matched. - See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926 + ! Known Limitations. + The Truffle regex engine does not transparently handle normalization. 
+ Due to this limitation, Unicode normalization has been disabled for + this function, so beware that some equivalent graphemes like 'ś' and + 's\u0301' will not be matched. Not_Like pattern:Text ## Is the value contained in `values`? @@ -212,7 +208,4 @@ type Filter_Condition ## PRIVATE sql_like_to_regex sql_pattern = regex_pattern = Regex_Utils.sql_like_pattern_to_regex sql_pattern - ## There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting. - https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926 - Once that bug is fixed, `match_ascii` may be set back to `False`. - Regex.compile regex_pattern dot_matches_newline=True match_ascii=True + Regex.compile regex_pattern diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso index b2b731c25c..15852a2c1a 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text.enso @@ -9,7 +9,6 @@ from project.Data.Boolean import Boolean, True, False polyglot java import org.enso.base.Text_Utils - ## Enso's text type. 
Enso's text type is natively unicode aware, and will handle arbitrary diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index 1cd5c610f2..95d7ffc615 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -12,10 +12,9 @@ import project.Data.Text.Case_Sensitivity.Case_Sensitivity import project.Data.Text.Encoding.Encoding import project.Data.Text.Location.Location import project.Data.Text.Matching_Mode.Matching_Mode +import project.Data.Text.Regex import project.Data.Text.Regex.Match.Match -import project.Data.Text.Regex.Regex_Mode.Regex_Mode -import project.Data.Text.Regex_2 -import project.Data.Text.Regex_2.Regex_Syntax_Error +import project.Data.Text.Regex.Regex_Syntax_Error import project.Data.Text.Span.Span import project.Data.Text.Span.Utf_16_Span import project.Data.Text.Text @@ -233,7 +232,7 @@ Text.characters self = Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error | Illegal_Argument Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = case_insensitive = case_sensitivity.is_case_insensitive_in_memory - compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive + compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive compiled_pattern.match self ## Finds all the matches of the regular expression `pattern` in `self`, @@ -260,7 +259,7 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = Text.find_all : Text -> Case_Sensitivity -> Vector Match ! 
Regex_Syntax_Error | Illegal_Argument Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = case_insensitive = case_sensitivity.is_case_insensitive_in_memory - compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive + compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive compiled_pattern.match_all self ## ALIAS Check Matches @@ -290,7 +289,7 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error | Illegal_Argument Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive = case_insensitive = case_sensitivity.is_case_insensitive_in_memory - compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive + compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive compiled_pattern.matches self ## ALIAS Split Text @@ -348,7 +347,7 @@ Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_re True -> case delimiter of _ : Text -> case_insensitive = case_sensitivity.is_case_insensitive_in_memory - compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive + compiled_pattern = Regex.compile delimiter case_insensitive=case_insensitive compiled_pattern.split self _ : Vector -> parenthesize s = "(?:" + s + ")" @@ -383,7 +382,7 @@ Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_re Text.tokenize : Text -> Case_Sensitivity -> Vector Text Text.tokenize self pattern="." 
case_sensitivity=Case_Sensitivity.Sensitive = case_insensitive = case_sensitivity.is_case_insensitive_in_memory - compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive + compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive compiled_pattern.tokenize self ## ALIAS Replace Text @@ -477,7 +476,7 @@ Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive o Text_Utils.replace_spans self spans_array replacement True -> case_insensitive = case_sensitivity.is_case_insensitive_in_memory - compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive + compiled_pattern = Regex.compile term case_insensitive=case_insensitive compiled_pattern.replace self replacement only_first ## ALIAS Get Words diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Helpers.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Helpers.enso new file mode 100644 index 0000000000..ef6c323d63 --- /dev/null +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Helpers.enso @@ -0,0 +1,17 @@ +from Standard.Base import all + +import project.Any.Any +import project.Data.Locale.Locale +import project.Data.Text.Case_Sensitivity.Case_Sensitivity +import project.Errors.Common.Type_Error +import project.Meta + +## PRIVATE + + Assert that `text_maybe` is a Text, then call the action. +expect_text : Any -> Any -> Any ! 
Type_Error +expect_text text_maybe ~action = case text_maybe of + _ : Text -> action + _ -> + Error.throw (Type_Error.Error Text (Meta.type_of text_maybe) "text_maybe") + diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching.enso deleted file mode 100644 index d963698bf8..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Matching.enso +++ /dev/null @@ -1,110 +0,0 @@ -import project.Data.Numbers.Integer -import project.Data.Pair.Pair -import project.Data.Range.Extensions -import project.Data.Text.Text -import project.Data.Vector.Vector -import project.Errors.Problem_Behavior.Problem_Behavior -import project.Panic.Panic -import project.Panic.Wrapped_Dataflow_Error - -from project.Data.Boolean import Boolean, True, False - -## UNSTABLE - An error indicating that some criteria did not match any names in the input. -type No_Matches_Found - Error (criteria : Vector Text) - - to_display_text : Text - to_display_text self = - "The criteria "+self.criteria.to_text+" did not match any names in the input." - -## PRIVATE -match_criteria_implementation matcher objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning = - result = internal_match_criteria_implementation matcher objects criteria reorder name_mapper - unmatched_criteria = result.second - problems = if unmatched_criteria.is_empty then [] else - [No_Matches_Found.Error unmatched_criteria] - on_problems.attach_problems_after result.first problems - -## PRIVATE -match_criteria_callback matcher objects criteria problem_callback reorder=False name_mapper=(x->x) = - result = internal_match_criteria_implementation matcher objects criteria reorder name_mapper - unmatched_criteria = result.second - problem_callback unmatched_criteria - result.first - -type Match_Matrix - ## PRIVATE - A helper type holding a matrix of matches. 
- Value matrix criteria objects - - # Checks if the ith object is matched by any criterion. - is_object_matched_by_anything : Integer -> Boolean - is_object_matched_by_anything self i = - self.matrix.at i . any x->x - - # Checks if the ith criterion matches any objects. - does_criterion_match_anything : Integer -> Boolean - does_criterion_match_anything self i = - self.matrix.map (col -> col.at i) . any x->x - - ## PRIVATE - Extracts the list of criteria that did not have any matches. - unmatched_criteria self = - checked_criteria = self.criteria.map_with_index j-> criterion-> - has_matches = self.does_criterion_match_anything j - Pair.new has_matches criterion - checked_criteria.filter (p -> p.first.not) . map .second - - ## PRIVATE - Returns the list of criteria that match the ith object. - criteria_matching_object : Integer -> Vector - criteria_matching_object self i = - self.criteria.filter_with_index j-> _-> - self.matrix . at i . at j - - ## PRIVATE - Returns the list of criteria indices that match the ith object. - criteria_indices_matching_object : Integer -> Vector - criteria_indices_matching_object self i = - (0.up_to self.criteria.length).filter j-> - self.matrix . at i . at j - -## PRIVATE - Generates a matrix specifying which criteria match which object. - - The returned `match_matrix` satisfies the following condition: - `match_matrix . at i . at j` is `True` if and only if `objects.at i` matches - `criteria.at j`. 
-make_match_matrix matcher objects criteria object_name_mapper=(x->x) criterion_mapper=(x->x) = - matrix = objects.map obj-> - criteria.map criterion-> - matcher.match_single_criterion (object_name_mapper obj) (criterion_mapper criterion) - Match_Matrix.Value matrix criteria objects - -## PRIVATE -internal_match_criteria_implementation matcher objects criteria reorder=False name_mapper=(x->x) = Panic.catch Wrapped_Dataflow_Error (handler = x-> x.payload.unwrap) <| - ## TODO [RW] discuss: this line of code also shows an issue we had with ensuring input dataflow-errors are correctly propagated, later on we stopped doing that and testing for that as it was too cumbersome. Maybe it could be helped with an @Accepts_Error annotation similar to the one from the interpreter??? - [matcher, objects, criteria, reorder, name_mapper] . each v-> - Panic.rethrow (v.map_error Wrapped_Dataflow_Error.Error) - - match_matrix = make_match_matrix matcher objects criteria name_mapper - unmatched_criteria = match_matrix.unmatched_criteria - - # Selects object indices which satisfy the provided predicate. - select_matching_indices : (Integer -> Boolean) -> Vector Text - select_matching_indices matcher = - 0.up_to objects.length . to_vector . filter matcher - - selected_indices = case reorder of - True -> - nested_indices = 0.up_to criteria.length . map j-> - is_object_matched_by_this_criterion i = - match_matrix.matrix.at i . at j - select_matching_indices is_object_matched_by_this_criterion - nested_indices.flat_map x->x . 
distinct - False -> - select_matching_indices match_matrix.is_object_matched_by_anything - - result = selected_indices.map objects.at - Pair.new result unmatched_criteria diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex.enso index c24e966ee2..269a58461c 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex.enso @@ -1,124 +1,63 @@ -## This module contains the basic interface to the more advanced functionality - of Enso's regular expression engine. - - TODO Examples - -import project.Data.Boolean.Boolean +import project.Any.Any import project.Data.Numbers.Integer -import project.Data.Text.Regex.Engine.Engine +import project.Data.Text.Prim_Text_Helper import project.Data.Text.Regex.Pattern.Pattern -import project.Data.Text.Regex.Engine.Default -import project.Data.Text.Regex.Regex_Option.Regex_Option import project.Data.Text.Text -import project.Data.Vector.Vector -import project.Errors.Common.Compile_Error +import project.Error.Error +import project.Errors.Illegal_Argument.Illegal_Argument import project.Nothing.Nothing +import project.Panic.Panic +from project.Data.Boolean import Boolean, True, False +from project.Errors.Common import Syntax_Error + +polyglot java import org.enso.base.Regex_Utils ## Compile the provided `expression` into a regex pattern that can be used for matching. Arguments - expression: The text representing the regular expression that you want to - compile. - - engine: The regular expression engine to use. It defaults to Enso's - built-in one which has good performance and a full feature-set. - - match_ascii: Enables or disables pure-ASCII matching for the regex. If you - know your data only contains ASCII then you can enable this for a - performance boost on some regex engines. + compile. Must be non-empty. 
- case_insensitive: Enables or disables case-insensitive matching. Case insensitive matching behaves as if it normalises the case of all input text before matching on it. - - dot_matches_newline: Enables or disables the dot matches newline option. - This specifies that the `.` special character should match everything - _including_ newline characters. Without this flag, it will match all - characters _except_ newlines. - - multiline: Enables or disables the multiline option. Multiline specifies - that the `^` and `$` pattern characters match the start and end of lines, - as well as the start and end of the input respectively. - - comments: Enables or disables the comments mode for the regular expression. - In comments mode, the following changes apply: - - Whitespace within the pattern is ignored, except when within a - character class or when preceded by an unescaped backslash, or within - grouping constructs (e.g. `(?...)`). - - When a line contains a `#`, that is not in a character class and is not - preceded by an unescaped backslash, all characters from the leftmost - such `#` to the end of the line are ignored. That is to say, they act - as _comments_ in the regex. - - extra_opts: Specifies additional options in a vector. This allows options - to be supplied and computed without having to break them out into arguments - to the function. Where these overlap with one of the flags (`match_ascii`, - `case_insensitive`, `dot_matches_newline`, `multiline` and `verbose`), the - flags take precedence. - ! Boolean Flags and Extra Options - This function contains a number of arguments that are boolean flags that - enable or disable common options for the regex. At the same time, it also - provides the ability to specify options in the `extra_opts` argument. - - Where one of the flags is _set_ (has the value `True` or `False`), the - value of the flag takes precedence over the value in `extra_opts` when - merging the options to the engine. 
The flags are _unset_ (have value - `Nothing`) by default. + If an empty regex is used, `compile` throws an Illegal_Argument error. ? Why Compile? While many regex engines are able to cache ad-hoc patterns, it is often useful to be able to manually retain a pattern that you have computed. This function exists so you can hold onto the resultant `Pattern` object, instead of immediately proceeding to match using it. -compile : Text -> Engine -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Vector Regex_Option -> Pattern ! Compile_Error -compile expression engine=Default.new match_ascii=Nothing case_insensitive=Nothing dot_matches_newline=Nothing multiline=Nothing comments=Nothing extra_opts=[] = - options_vec = from_flags match_ascii case_insensitive dot_matches_newline multiline comments extra_opts - engine.compile expression options_vec +compile : Text -> Boolean | Nothing -> Pattern ! Regex_Syntax_Error | Illegal_Argument +compile self expression case_insensitive=Nothing = + if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else + options_string = if case_insensitive == True then "usgi" else "usg" -## Escape the special characters in `expression` such that the result is a valid - literal pattern for the original string. + internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic-> + Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message)) + + Pattern.Value internal_regex_object + +## ADVANCED + + Escape the special characters in `expression` such that the result is a + valid literal pattern for the original string. Arguments: - expression: The expression to escape metacharacters in. - - engine: The regular expression engine to use. It defaults to Enso's - built-in one which has good performance and a full feature-set. - ! 
Matching Engines - Care should be taken to ensure that you use the same engine for escaping - and matching, as engine syntax may differ in certain cases. -escape : Text -> Engine -> Text -escape expression engine=Default.new = engine.escape expression + > Example + Turn a Text into a regex that matches that string exactly. -## PRIVATE + example_escape = + literal_string = "\!\.|abcde" + Regex.escape literal_string +escape : Text -> Text +escape self expression = Regex_Utils.regexQuote expression - Turns the options flags into a vector of options. -from_flags : Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Vector Regex_Option -> Vector Regex_Option -from_flags match_ascii case_insensitive dot_matches_newline multiline comments extra_opts = - builder = Vector.new_builder - - process_override : Boolean | Nothing -> Regex_Option -> Nothing - process_override param option = case param of - _ : Boolean -> if param then builder.append option - Nothing -> if extra_opts.contains option then builder.append option - - process_override match_ascii Regex_Option.Ascii_Matching - process_override case_insensitive Regex_Option.Case_Insensitive - process_override dot_matches_newline Regex_Option.Dot_Matches_Newline - process_override multiline Regex_Option.Multiline - process_override comments Regex_Option.Comments - - ## Add any non-overridable options from extra_opts - extra_opts.each opt-> - not_ascii = opt != Regex_Option.Ascii_Matching - not_insensitive = opt != Regex_Option.Case_Insensitive - not_dot_matches_newline = opt != Regex_Option.Dot_Matches_Newline - not_multiline = opt != Regex_Option.Multiline - not_comments = opt != Regex_Option.Comments - - if not_ascii && not_insensitive && not_dot_matches_newline && not_multiline && not_comments then - builder.append opt - - builder.to_vector - -## PRIVATE - - An error that is emitted when there is no such group in the match for the +## An error that is emitted when 
there is no such group in the match for the provided `id`. Arguments: @@ -134,46 +73,10 @@ type No_Such_Group _ : Integer -> "No group exists with the index " + self.id.to_text + "." _ : Text -> "No group exists with the name " + self.id + "." -## PRIVATE - - An error representing that one of the passed options was invalid. - - Arguments: - - opt: The option that was not valid for this regex engine. -type Invalid_Option - Error (opt : Any) - +## A syntax error reported by the Truffle regex compiler. +type Regex_Syntax_Error ## PRIVATE - Provides a human-readable representation of the invalid option error. - to_display_text : Text - to_display_text self = - "The option " + self.opt.to_text + " is not valid for the default regex engine." - -## PRIVATE - - An error representing that there is something wrong with the mode for a regex - match. - - Arguments: - - message: The text of the message to display to users. -type Mode_Error - Error (message : Text) - - ## PRIVATE - - Provides a human-readable representation of the mode error. - to_display_text : Text - to_display_text self = self.message.to_text - -## PRIVATE - - An error representing that the bounds for a match are invalid. -type Invalid_Bounds_Error - - ## PRIVATE - - Provides a human-readable representation of the invalid bounds error. - to_display_text : Text - to_display_text = - "The start bound cannot be greater than the end bound." + Arguments: + - message: A description of the erroneous syntax. + Error message diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine.enso deleted file mode 100644 index 97ea101752..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine.enso +++ /dev/null @@ -1,51 +0,0 @@ -## An `Engine` is a configuration and behaviour specification object for a - particular regular expression engine. 
- - An implementation of a regular expression engine must implement the below - interface, as well as conform to the following requirements: - - - The engine must operate in a unicode mode by default, using canonical - form for equality and the unicode versions of the standard character - classes. - - It must support the standard options specified in - `Standard.Base.Data.Text.Regex.Regex_Option`. It may specify additional, - engine-specific options, but this is not required by the specification. - - In the defining module, the engine implementation must provide a full - specification of its syntax in the module documentation block. - - This file is _not executable_. It instead describes the interface for the - customisable `Engine` and `Pattern` types. - -import project.Data.Text.Text -import project.Data.Text.Regex.Regex_Option.Regex_Option -import project.Data.Text.Regex.Invalid_Option -import project.Data.Text.Regex.Pattern.Pattern -import project.Data.Vector.Vector -import project.Errors.Common.Compile_Error -import project.Errors.Unimplemented.Unimplemented - -## The `Data.Text.Regex.Engine.Engine` interface. -type Engine - - ## PRIVATE - - Compile the provided `expression` into a regex pattern that can be used - for matching. - - Arguments - - expression: The text representing the regular expression that you want - to compile. - - options: The options to configure the matching process with. These are - merged with the specific `engine_opts`. - compile : Text -> Vector Regex_Option -> Pattern ! (Compile_Error | Invalid_Option) - compile self _ _ = Unimplemented.throw "This is an interface only." - - ## PRIVATE - - Escape the special characters in `expression` such that the result is a - valid literal pattern for the original string. - - Arguments: - - expression: The expression to escape metacharacters in. - escape : Text -> Text - escape self _ = Unimplemented.throw "This is an interface only." 
diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso deleted file mode 100644 index d202f83b8c..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Engine/Default.enso +++ /dev/null @@ -1,888 +0,0 @@ -## Enso's default regular expression matching engine. - - Enso's default regular expression engine uses Java's regular expression - syntax, extended with support for the unicode character classes and - properties. A detailed explanation of the syntax is below. - - ! Raw Strings - Enso has support for raw strings using the `""` quotes. Within a raw - string, all characters are interpreted to mean themselves. This means that - you do not need to double-escape special characters in regular expressions. - - ! Characters and Regex - When the default regex engine provdies a position with regards to - "characters", it is referring to positions in terms of the UTF-16 - characters in the text. These indices must be used to index into the - vector of UTF-16 characters. It will otherwise be wrong. - - ! Escaping - The backslash character `"\"` serves to introduce escaped constructs, as - defined in "Syntax Specification" below, as well as to quote characters - that would otherwise be interpreted as unescaped constructs. As a result, - the expression `"\\"` matches a single backslash, and `"\{"` matches an - opening brace. - - It is a parse error for the regular expression to use a backslash prior to - any alphabetic character that does not denote an escaped construct. It is, - however, valid to put a backslash before any symbolic character. - - ? Syntax Specification - The syntax supported by the default regular expression engine is described - here. The pattern described by the regular expression can then be used to - match against text. 
- - TBC - -import project.Any.Any -import project.Data.Map.Map -import project.Data.Numbers.Integer -import project.Data.Range.Extensions -import project.Data.Text.Matching_Mode.Matching_Mode -import project.Data.Text.Regex.Invalid_Option -import project.Data.Text.Regex.Invalid_Bounds_Error -import project.Data.Text.Regex.Mode_Error -import project.Data.Text.Regex.No_Such_Group -import project.Data.Text.Regex.Regex_Mode.Regex_Mode -import project.Data.Text.Regex.Regex_Option.Regex_Option -import project.Data.Text.Text -import project.Data.Text.Span.Utf_16_Span -import project.Data.Vector.Vector -import project.Meta -import project.Nothing.Nothing -import project.Panic.Panic - -from project.Data.Boolean import Boolean, True, False -from project.Errors.Common import Compile_Error, Syntax_Error - -polyglot java import java.lang.IllegalArgumentException -polyglot java import java.lang.IndexOutOfBoundsException -polyglot java import java.lang.StringBuffer -polyglot java import java.util.regex.Matcher as Java_Matcher -polyglot java import java.util.regex.Pattern as Java_Pattern -polyglot java import java.util.regex.PatternSyntaxException - -polyglot java import com.ibm.icu.impl.UnicodeRegex -polyglot java import org.enso.base.Regex_Utils -polyglot java import org.enso.base.Text_Utils - -## Construct an instance of the default engine. - - Arguments: - - opts: Any engine-specific options. - - > Example - Build a new default engine specifying literal mode. - - import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine - - example_new = - engine_opts = [Default_Engine.Option.Literal_Pattern] - Default_Engine.new engine_opts -new : Vector (Regex_Option | Option) -> Default_Engine -new opts=[] = Default_Engine.Value opts - -## The default implementation of the `Data.Text.Regex.Engine.Engine` interface. -type Default_Engine - - ## PRIVATE - - The default regex engine for Enso. 
- - Arguments: - - engine_opts: Options for regex matching that are specific to this - engine. - Value (engine_opts : Vector (Regex_Option | Option)) - - ## ADVANCED - - Compile the provided `expression` into a regex pattern that can be used - for matching. - - Arguments - - expression: The text representing the regular expression that you want - to compile. - - options: The options to configure the matching process with. These are - merged with the specific `engine_opts`. - - ? Why Compile? - While many regex engines are able to cache ad-hoc patterns, it is often - useful to be able to manually retain a pattern that you have computed. - This function exists so you can hold onto the resultant `Pattern` - object, instead of immediately proceeding to match using it. - - > Example - Compile the regex `"^a$"` in multiline mode so it matches all lines - consisting of a single "a". - - import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine - import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option - - example_compile = - expression = "^a$" - options = [Regex_Option.Multiline] - engine = Default_Engine.new - engine.compile expression options - compile : Text -> Vector (Regex_Option | Option) -> Pattern ! (Compile_Error | Invalid_Option) - compile self expression options = - all_options = options + self.engine_opts - options_bitmask = from_enso_options all_options - unicode_regex = UnicodeRegex.new - - maybe_java_pattern = Panic.recover Any <| - Java_Pattern.compile (unicode_regex.transform expression) options_bitmask - - internal_pattern = maybe_java_pattern.map_error case _ of - err : PatternSyntaxException -> Syntax_Error.Error ("The regex could not be compiled: " + err.getMessage) - other -> other - - Pattern.Value internal_pattern all_options self - - ## ADVANCED - - Escape the special characters in `expression` such that the result is a - valid literal pattern for the original string. 
- - Arguments: - - expression: The expression to escape metacharacters in. - - > Example - Turn a literal string into a regex that matches that string exactly. - - import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine - import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option - - example_escape = - literal_string = "\!\.|abcde" - engine = Default_Engine.new - engine.escape literal_string - escape : Text -> Text - escape self expression = Java_Pattern.quote expression - -## The default implementation of the `Data.Text.Regex.Engine.Pattern` interface. -type Pattern - - ## PRIVATE - - The default pattern type for Enso, produced by the default regex engine. - - Arguments: - - internal_pattern: The internal representation of the compiled pattern. - - options: The vector of options with which this pattern was built. - - engine: A handle to the engine that built this pattern. - Value (internal_pattern : Java_Pattern) (options : Vector (Regex_Option | Option)) (engine : Default_Engine) - - ## PRIVATE - - Constructs an internal matcher, settings the region as provided and - handling some additional options. - - Arguments: - - input: The text on which it will be matching. - - start: The start of the matcher's region. - - end: The end of the matcher's region. - - ! Unicode Normalization - The Regex engine used here handles string modifiers, like accents in a - weird way. The string "s\u{301}" will be treated as containing "s" - within it, but "ś" (which is canonically equivalent to the former one) - will not contain "s". To get consistent behavior that does not depend - on the encoding, we normalize all input. - build_matcher : Text -> Integer -> Integer -> Java_Matcher - build_matcher self input start end = - ## TODO [RW] Normalization had to be disabled - since start and end are - in code unit space, normalization could shift these indices! 
- This should be addressed when reviewing - See: https://www.pivotaltracker.com/story/show/181524498 - #normalized_input = if self.options.contains Regex_Option.Ascii_Matching then input else - # Text_Utils.normalize input - normalized_input = input - internal_matcher = self.internal_pattern.matcher normalized_input . region start end - - if self.options.contains Option.No_Anchoring_Bounds then - internal_matcher.useAnchoringBounds False - if self.options.contains Option.Transparent_Bounds then - internal_matcher.useTransparentBounds True - - internal_matcher - - ## ADVANCED - - Tries to match the provided `input` against the pattern `self`. - - Arguments: - - input: The text to match the pattern described by `self` against. - - mode: The matching mode to use. - - This method will _always_ return `Nothing` if it fails to match. - - ? Return Type - When asked to match in a mode that can only provide a single match, the - return type is either a single `Match` object. When asked to match in a - mode that permits multiple matches, it will always return a `Vector`, - even if only a single match is found. - - > Example - Match the first instance of the pattern `".."` in the input. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".." [] - input = "abcdefghij" - pattern.match input mode=Matching_Mode.First - - > Example - Match up to the first 3 instances of the pattern `".."` in the input. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".." [] - input = "abcdefghij" - pattern.match input mode=3 - - > Example - Match all instances of the pattern `".."` in the input. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".." [] - input = "abcdefghij" - pattern.match input - - > Example - Check if the pattern `".*"` matches on the entire input. 
- - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".*" [] - input = "abcdefghij" - pattern.match input mode=Regex_Mode.Full - match : Text -> (Regex_Mode | Matching_Mode) -> Match | Vector Match | Nothing - match self input mode=Regex_Mode.All = - do_match_mode mode start end = case mode of - Matching_Mode.First -> - internal_matcher = self.build_matcher input start end - - if internal_matcher . find start . not then Nothing else - Match.Value internal_matcher start end input - _ : Integer -> - if mode < 0 then Panic.throw <| - Mode_Error.Error "Cannot match a negative number of times." - - builder = Vector.new_builder - - go : Integer -> Integer -> Nothing - go offset remaining_count = - should_continue = remaining_count > 0 - if should_continue.not || (offset >= end) then Nothing else - internal_matcher = self.build_matcher input start end - found = internal_matcher.find offset - - if found.not then Nothing else - builder.append (Match.Value internal_matcher start end input) - match_end = internal_matcher.end 0 - # Ensure progress even if the match is an empty string. - new_offset = if match_end > offset then match_end else offset+1 - @Tail_Call go new_offset remaining_count-1 - - go start mode - vector = builder.to_vector - - if vector.is_empty then Nothing else vector - Regex_Mode.All -> - builder = Vector.new_builder - - go : Integer -> Nothing - go offset = - if offset >= end then Nothing else - internal_matcher = self.build_matcher input start end - found = internal_matcher.find offset - - if found.not then Nothing else - builder.append (Match.Value internal_matcher start end input) - match_end = internal_matcher.end 0 - # Ensure progress even if the match is an empty string. 
- new_offset = if match_end > offset then match_end else offset+1 - @Tail_Call go new_offset - - go start - vector = builder.to_vector - - if vector.is_empty then Nothing else vector - Regex_Mode.Full -> - internal_matcher = self.build_matcher input start end - if internal_matcher.matches.not then Nothing else - Match.Value internal_matcher start end input - Regex_Mode.Bounded _ _ _ -> Panic.throw <| - Mode_Error.Error "Modes cannot be recursive." - - case mode of - Regex_Mode.Bounded start end sub_mode -> - if start < end then do_match_mode sub_mode start end else - Panic.throw Invalid_Bounds_Error - _ -> do_match_mode mode 0 (Text_Utils.char_length input) - - ## ADVANCED - - Returns `True` if the input matches against the pattern described by - `self`, otherwise `False`. - - Arguments: - - input: The text to check for matching. - - > Example - Check if the input "aa" matches against the pattern `".."`. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".." [] - input = "aa" - pattern.matches input - matches : Text -> Boolean - matches self input = case self.match input mode=Regex_Mode.Full of - _ : Match -> True - _ : Vector -> True - _ -> False - - ## ADVANCED - - Tries to find the text in the `input` that matches against the pattern - `self`. - - Arguments: - - input: The text to find matches in. - - mode: The matching mode to use. - - This method will _always_ return `Nothing` if it fails to find any - matches. - - ? Return Type - When asked to match in a mode that can only provide a single match, the - return type is either a single `Match` object. When asked to match in a - mode that permits multiple matches, it will always return a `Vector`, - even if only a single match is found. - - > Example - Find the first instance of the pattern `".."` in the input. 
- - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".." [] - input = "abcdefghij" - pattern.find input mode=Matching_Mode.First - - > Example - Find up to the first 3 instances of the pattern `".."` in the input. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".." [] - input = "abcdefghij" - pattern.find input mode=3 - - > Example - Find all instances of the pattern `".."` in the input. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".." [] - input = "abcdefghij" - pattern.find input - - > Example - Find if the pattern `".*"` matches on the entire input. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".*" [] - input = "abcdefghij" - pattern.find input mode=Regex_Mode.Full - find : Text -> (Regex_Mode | Matching_Mode) -> Text | Vector Text | Nothing - find self input mode=Regex_Mode.All = - matches = self.match input mode - case matches of - _ : Match -> matches.group 0 - _ : Vector -> matches.map (_.group 0) - _ -> matches - - ## ADVANCED - - Splits the `input` text based on the pattern described by `self`. - - Arguments: - - input: The text to splut based on the pattern described by `self`. - - mode: The splitting mode to use. - - This method will _always_ return a vector. If no splits take place, the - vector will contain a single element. - - > Example - Split the input on the first instance of the pattern `"aa"`. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile "aa" [] - input = "abaaabbaabba" - pattern.match input mode=Matching_Mode.First - - > Example - Split on up to the first 3 instances of the pattern `"a"` in the input. 
- - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile "a" [] - input = "bacadaeaf" - pattern.match input mode=3 - - > Example - Split on all all instances of the pattern `"a"` in the input. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile "a" [] - input = "bacadaeaf" - pattern.match input - split : Text -> Matching_Mode | Regex_Mode | Integer -> Vector Text - split self input mode=Regex_Mode.All = - # Java uses this to mean the max length of the resulting array, so we - # add 1. - limit = case mode of - Matching_Mode.First -> 2 - _ : Integer -> - if mode < 0 then Panic.throw <| - Mode_Error.Error "Cannot match a negative number of times." - - mode + 1 - Regex_Mode.All -> -1 - Regex_Mode.Full -> Panic.throw <| - Mode_Error.Error "Splitting on a full match yields an empty text." - Regex_Mode.Bounded _ _ _ -> Panic.throw <| - Mode_Error.Error "Splitting on a bounded region is not well-defined." - Matching_Mode.Last -> Panic.throw <| - Mode_Error.Error "Splitting on the last match is not supported." - - splits = self.internal_pattern.split input limit - Vector.from_polyglot_array splits - - ## ADVANCED - - Replace all occurrences of the pattern described by `self` in the `input` - with the specified `replacement`. - - Arguments: - - input: The text in which to perform the replacement(s). - - replacement: The literal text with which to replace any matches. - - mode: The matching mode to use for finding candidates to replace. - - If this method performs no replacements it will return the `input` text - unchanged. - - > Example - Replace the first occurrence of the pattern `".."` in the input with - the text `"REPLACED"`. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile ".." 
[] - input = "abcdefghij" - pattern.replace input "REPLACED" mode=Matching_Mode.First - - > Example - Replace up to the first 3 instances of the pattern `"aa"` in the input - with the text `"REPLACED"`. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile "aa" [] - input = "aabbaaaabb" - pattern.replace input "REPLACED" mode=3 - - > Example - Replace all instances of the pattern `"aa"` in the input with the text - `"REPLACED"`. - - import Standard.Base.Data.Text.Regex.Engine.Default - - example_match = - engine = Default.new - pattern = engine.compile "aa [] - input = "aabbaabbbbbaab" - pattern.replace input "REPLACED" - replace : Text -> Text -> Regex_Mode | Matching_Mode | Integer -> Text - replace self input replacement mode=Regex_Mode.All = - do_replace_mode mode start end = case mode of - Matching_Mode.First -> - internal_matcher = self.build_matcher input start end - internal_matcher.replaceFirst replacement - _ : Integer -> - if mode < 0 then Panic.throw <| - Mode_Error.Error "Cannot replace a negative number of times." 
- - internal_matcher = self.build_matcher input start end - buffer = StringBuffer.new - - go remaining_replacements = - if (internal_matcher.find) && (remaining_replacements > 0) then - internal_matcher.appendReplacement buffer replacement - @Tail_Call go (remaining_replacements - 1) - - go mode - internal_matcher.appendTail buffer - buffer.to_text - Regex_Mode.All -> - internal_matcher = self.build_matcher input start end - internal_matcher.replaceAll replacement - Regex_Mode.Full -> - case self.match input mode=Regex_Mode.Full of - _ : Match -> self.replace input replacement Matching_Mode.First - Nothing -> input - Matching_Mode.Last -> - all_matches = self.match input - all_matches_count = if all_matches.is_nothing then 0 else all_matches.length - - if all_matches_count == 0 then input else - internal_matcher = self.build_matcher input start end - buffer = StringBuffer.new - last_match_index = all_matches_count - 1 - - go match_index = - internal_matcher.find - case match_index == last_match_index of - True -> internal_matcher.appendReplacement buffer replacement - False -> @Tail_Call go (match_index + 1) - - go 0 - internal_matcher.appendTail buffer - buffer.to_text - Regex_Mode.Bounded _ _ _ -> Panic.throw <| - Mode_Error.Error "Modes cannot be recursive." - - case mode of - Regex_Mode.Bounded _ _ _ -> Panic.throw <| - Mode_Error.Error "Bounded replacements are not well-formed." - _ -> do_replace_mode mode 0 (Text_Utils.char_length input) - -## The default implementation of the `Data.Text.Regex.Engine.Match` interface. -type Match - - ## PRIVATE - - A representation of a regular expression match. - - Arguments: - - internal_match: The internal representation of the regular expression - match. - - region_start: The start of the region over which the match was made. - - region_end: The end of the region over which the match was made. - - input: The input text that was being matched. 
- Value (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) (input : Text) - - ## Gets the text matched by the group with the provided identifier, or - `Nothing` if the group did not participate in the match. If no such group - exists for the provided identifier, a `No_Such_Group` is returned. - - Arguments: - - id: The index or name of that group. - - ? The Full Match - The group with index 0 is always the full match of the pattern. - - ? Named Groups by Index - If the regex contained named groups, these may also be accessed by - index based on their position in the pattern. - - > Example - Get the text of the group with the index 0. - - import Standard.Examples - - example_group = - match = Examples.match - match.group 0 - - > Example - Get the text of the group with the name "letters". - - import Standard.Examples - - example_group = - match = Examples.match - match.group "letters" - group : Integer | Text -> Text | Nothing ! No_Such_Group - group self id = - Panic.recover Any (self.internal_match.group id) . map_error (handle_error _ id) - - ## Gets a vector containing the results of _all_ of the capturing groups in - the pattern, replacing the value of groups that did not participate in - the match with `default`. - - Arguments: - - default: The value to return for a given index when the group at that - index did not participate in the match. - - ? The Full Match - The group with index 0 is always the full match of the pattern. - - ? Named Groups by Index - If the regex contained named groups, these may also be accessed by - index based on their position in the pattern. - - > Example - Get a vector of the text matched by all of the groups in this match, - replacing the value for groups that didn't match with "UNMATCHED". 
- - import Standard.Examples - - example_groups = - match = Examples.match - match.groups default="UNMATCHED" - groups : Any -> Vector (Text | Any) - groups self default=Nothing = - group_numbers = 0.up_to self.internal_match.groupCount+1 - group_numbers.map n-> - case self.group n of - Nothing -> default - a -> a - - ## Gets a map containing the named capturing groups for the pattern, - replacing the value for groups that did not participate in the match with - `default`. - - Arguments: - - default: The value to return for a given name when the group at that - index did not participate in the match. - - > Example - Get the map of all of the named groups in this match, replacing the - value for groups that didn't match with "UNMATCHED". - - import Standard.Examples - - example_groups = - match = Examples.match - matcg.named_groups default="UNMATCHED" - named_groups : Any -> Map Text (Text | Any) - named_groups self default=Nothing = - group_names = Vector.from_polyglot_array <| - Regex_Utils.get_group_names self.internal_match.pattern - pairs = group_names.map name-> - value = case self.group name of - Nothing -> default - a -> a - [name, value] - Map.from_vector pairs - - ## Gets the index of the first character captured by the group with the - given identifier, or `Nothing` if the group did not participate in the - match. - - Arguments: - - id: The identifier for the group to fetch the start index for. - - ! What is a Character? - This regular expression engine defines a "character" to mean a UTF-16 - character. This means that these indices should only be used with the - result of calling `.char_vector` on the text. Using them with - `.characters` or `.codepoints` will produce incorrect results. - - > Example - Get the start index in the input where the full pattern matched for - this match. - - import Standard.Examples - - example_start = - match = Examples.match - match.start 0 - start : Integer | Text -> Integer | Nothing ! 
No_Such_Group - start self id = - result = Panic.recover Any (self.internal_match.start id) - no_errors = result.map_error (handle_error _ id) - if no_errors == -1 then Nothing else no_errors - - ## Gets the index of the first character after `start` that was not captured - by the group with the given identifier, or `Nothing` if the group did not - participate in the match. - - Arguments: - - id: The identifier for the group to fetch the end index for. - - ! What is a Character? - This regular expression engine defines a "character" to mean a UTF-16 - character. This means that these indices should only be used with the - result of calling `.char_vector` on the text. Using them with - `.characters` or `.codepoints` will produce incorrect results. - - > Example - Get the end index in the input where the full pattern matched for this - match. - - import Standard.Examples - - example_end = - match = Examples.match - match.end 0 - end : Integer | Text -> Integer | Nothing ! No_Such_Group - end self id = - result = Panic.recover Any (self.internal_match.end id) - no_errors = result.map_error (handle_error _ id) - if no_errors == -1 then Nothing else no_errors - - ## Returns the span matched by the group with the provided identifier, or - `Nothing` if the group did not participate in the match. - - Arguments: - - id: The identifier for the group to fetch the end index for. - - ! What is a Character? - This regular expression engine defines a "character" to mean a UTF-16 - character. This means that these indices should only be used with the - result of calling `.char_vector` on the text. Using them with - `.characters` or `.codepoints` will produce incorrect results. - - > Example - Get the span over the input that was matched by the full match. - - import Standard.Examples - - example_Span = - match = Examples.match - match.span 0 - span : Integer | Text -> Utf_16_Span | Nothing ! 
No_Such_Group - span self id = case self.group id of - Nothing -> Nothing - _ -> Utf_16_Span.Value ((self.start id).up_to (self.end id)) self.input - - ## Returns the start character index of the match's region. - - ! What is a Character? - This regular expression engine defines a "character" to mean a UTF-16 - character. This means that these indices should only be used with the - result of calling `.char_vector` on the text. Using them with - `.characters` or `.codepoints` will produce incorrect results. - - > Example - Get the start position in the input to which this match was limited. - - import Standard.Examples - - example_start_position = - match = Examples.match - match.start_position - start_position : Integer - start_position self = self.region_start - - ## Returns the end character index of the match's region. - - ! What is a Character? - This regular expression engine defines a "character" to mean a UTF-16 - character. This means that these indices should only be used with the - result of calling `.char_vector` on the text. Using them with - `.characters` or `.codepoints` will produce incorrect results. - - > Example - Get the end position in the input to which this match was limited. - - import Standard.Examples - - example_end_position = - match = Examples.match - match.end_position - end_position : Integer - end_position self = self.region_end - -## PRIVATE - - Handle errors when looking up group info. - - Arguments: - - error: The error as a value. - - id: The group identifier with which the error is associated. -handle_error : Any -> (Text | Integer) -> Any -handle_error error id = case error of - _ : IndexOutOfBoundsException -> No_Such_Group.Error id - _ : IllegalArgumentException -> No_Such_Group.Error id - other -> other - -## Options specific to the `Default` regular expression engine. -type Option - - ## Specifies that the input expression to the pattern be treated as a - sequence of literal characters. 
Metacharacters and escape sequences have - no special meaning in this mode. - Literal_Pattern - - ## Disables anchoring to the region's boundaries. - - By default, the regex engine will allow `^` and `$` to match the - boundaries of a restricted region. With this option specified, they will - only match the start and end of the input. - No_Anchoring_Bounds - - ## Enables transparent bounds. - - Setting this option will allow the regex engine to look "through" the - boundaries of the engine's region for the purposes of lookahead, - lookbehind, and boundary matching. - - Without this flag, the region boundaries are treated as opaque, meaning - that the above constructs will fail to match anything outside the region. - Transparent_Bounds - - ## Specifies that only the unix line ending `''\n'` be considered in the - behaviour of the `^` and `$` special characters. - Unix_Lines - -## PRIVATE - - Generates a Java bitmask representing the options used to configure the - regex. - - Arguments: - - opts: The enso-side options to configure the regex. 
-from_enso_options : Vector (Option | Regex_Option) -> Integer -from_enso_options opts = - java_flags = Panic.recover Any <| opts.flat_map case _ of - Option.Literal_Pattern -> [Java_Pattern.LITERAL] - Option.Unix_Lines -> [Java_Pattern.UNIX_LINES] - Option.No_Anchoring_Bounds -> [] - Option.Transparent_Bounds -> [] - Regex_Option.Case_Insensitive -> [Java_Pattern.CASE_INSENSITIVE] - Regex_Option.Dot_Matches_Newline -> [Java_Pattern.DOTALL] - Regex_Option.Multiline -> [Java_Pattern.MULTILINE] - Regex_Option.Comments -> [Java_Pattern.COMMENTS] - Regex_Option.Ascii_Matching -> [] - other -> Panic.throw (Invalid_Option.Error other) - - options_bitmask = java_flags.fold 0 .bit_or - - if opts.contains Regex_Option.Ascii_Matching then options_bitmask else - unicode = [Java_Pattern.CANON_EQ, Java_Pattern.UNICODE_CASE, Java_Pattern.UNICODE_CHARACTER_CLASS].fold 0 .bit_or - options_bitmask.bit_or unicode diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match.enso index 6dd855a068..243ec3d1a9 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match.enso @@ -1,24 +1,143 @@ import project.Any.Any import project.Data.Map.Map import project.Data.Numbers.Integer -import project.Data.Text.Span.Span -import project.Data.Text.Text +import project.Data.Range.Extensions +import project.Data.Range.Range import project.Data.Text.Regex.No_Such_Group +import project.Data.Text.Span.Span +import project.Data.Text.Span.Utf_16_Span +import project.Data.Text.Text import project.Data.Vector.Vector -import project.Errors.Unimplemented.Unimplemented +import project.Error.Error +import project.Errors.Common.Index_Out_Of_Bounds import project.Nothing.Nothing +import project.Panic.Panic + +from project.Data.Boolean import Boolean, True, False + -## The `Data.Text.Regex.Engine.Match` interface. 
type Match + ## PRIVATE + internal_regex_result : RegexResult (Truffle) + (See https://github.com/oracle/graal/blob/master/regex/docs/README.md) + Value (pattern : Pattern) (internal_regex_result : Any) (input : Text) ## PRIVATE + Returns the start UTF16 character index of a group. - Gets the text matched by the group with the provided identifier, or - `Nothing` if the group did not participate in the match. If no such group + This method goes directly to the internal match object. It does not + take group names, and does not have a default. + + Arguments: + - group: the integer group number. + internal_start : Integer -> Integer + internal_start self group = self.internal_regex_result.getStart group + + ## PRIVATE + Returns the end UTF16 character index, plus one, of a group. + + This method goes directly to the internal match object. It does not + take group names, and does not have a default. + + Arguments: + - group: the integer group number. + internal_end : Integer -> Integer + internal_end self group = self.internal_regex_result.getEnd group + + ## Returns the start UTF16 character index of a group. + + Arguments: + - group: the group name or number. Marked groups defined in the regex are + numbered starting at 1; group 0 refers to the entire match. + utf_16_start : Integer | Text -> Integer + utf_16_start self group=0 = + span = self.utf_16_span group + if span.is_nothing then Nothing else span.start + + ## Returns the end UTF16 character index, plus one, of a group. + + Arguments: + - group: the group name or number. Marked groups defined in the regex are + numbered starting at 1; group 0 refers to the entire match. + utf_16_end : Integer | Text -> Integer + utf_16_end self group=0 = + span = self.utf_16_span group + if span.is_nothing then Nothing else span.end + + ## Returns the start grapheme index of a group. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. 
This is the smallest unit that still has semantic + meaning in most text-processing applications. + + Arguments: + - group: the group name or number. Marked groups defined in the regex are + numbered starting at 1; group 0 refers to the entire match. + start : Integer | Text -> Integer + start self group=0 = + span = self.span group + if span.is_nothing then Nothing else span.start + + ## Returns the end grapheme index, plus one, of a group. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + Arguments: + - group: the group name or number. Marked groups defined in the regex are + numbered starting at 1; group 0 refers to the entire match. + end : Integer | Text -> Integer + end self group=0 = + span = self.span group + if span.is_nothing then Nothing else span.end + + ## Gets the UTF16 span matched by the group with the provided identifier, or + a default value if the group did not participate in the match. If no such + group exists for the provided identifier, a `No_Such_Group` is returned. + + Arguments: + - group: The integer index or name of that group. + + ? The Full Match + The group with index 0 is always the full match of the pattern. + + ? Named Groups by Index + If the regex contained named groups, these may also be accessed by + index based on their position in the pattern. + + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. + + Note that it is possible for a group to "not participate in the match", + for example with a disjunction. In the example below, the "(d)" group + does not participate -- it neither matches nor fails. 
+ + "ab((c)|(d))".find "abc" + + In this case, the group id for "(d)", which is 3, is a valid group id and + (Pattern.lookup_group 3) will return 3. If the caller tries to get group 3, + Match.utf_16_span will return the default value. + utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group + utf_16_span self group=0 ~default=Nothing = + group_id = self.pattern.lookup_group group + start = self.internal_start group_id + end = self.internal_end group_id + does_not_participate = start == -1 || end == -1 + if does_not_participate then default else + range = Range.new start end + Utf_16_Span.Value range self.input + + ## Gets the grapheme span matched by the group with the provided identifier, or + a default value if the group did not participate in the match. If no such group exists for the provided identifier, a `No_Such_Group` is returned. Arguments: - - id: The index or name of that group. + - group: The integer index or name of that group. ? The Full Match The group with index 0 is always the full match of the pattern. @@ -26,19 +145,62 @@ type Match ? Named Groups by Index If the regex contained named groups, these may also be accessed by index based on their position in the pattern. - group : Integer | Text -> Text | Nothing ! No_Such_Group - group self _ = Unimplemented.throw "This is an interface only." - ## PRIVATE + ! What is a Character? + A character is defined as an Extended Grapheme Cluster, see Unicode + Standard Annex 29. This is the smallest unit that still has semantic + meaning in most text-processing applications. - Gets a vector containing the results of _all_ of the capturing groups in + Note that it is possible for a group to "not participate in the match", + for example with a disjunction. In the example below, the "(d)" group + does not participate -- it neither matches nor fails. + + "ab((c)|(d))".find "abc" + + In this case, the group id for "(d)", which is 3, is a valid group id and + (Pattern.lookup_group 3) will return 3. 
If the caller tries to get + group 3, Match.span will return the default value. + span : Integer | Text -> Any -> Span ! No_Such_Group + span self group=0 ~default=Nothing = + result = self.utf_16_span group Nothing + if result.is_nothing then default else result.to_grapheme_span + + ## Gets the Text matched by the group with the provided identifier, or + a default value if the group did not participate in the match. If no such + group exists for the provided identifier, a `No_Such_Group` is returned. + + Arguments: + - group: The integer index or name of that group. + + ? The Full Match + The group with index 0 is always the full match of the pattern. + + ? Named Groups by Index + If the regex contained named groups, these may also be accessed by + index based on their position in the pattern. + + Note that it is possible for a group to "not participate in the match", + for example with a disjunction. In the example below, the "(d)" group + does not participate -- it neither matches nor fails. + + "ab((c)|(d))".find "abc" + + In this case, the group id for "(d)", which is 3, is a valid group id and + (Pattern.lookup_group 3) will return 3. If the caller tries to get + group 3, Match.text will return the default value. + text : Integer | Text -> Any -> Text ! No_Such_Group + text self group=0 ~default=Nothing = + result = self.span group Nothing + if result.is_nothing then default else result.text + + ## Gets a vector containing the Text of _all_ of the capturing groups in the pattern, replacing the value of groups that did not participate in - the match with `default`. + the match with `default`. This vector includes group 0, which contains + the entire match. Arguments: - default: The value to return for a given index when the group at that - index did not participate in the match. The default for this argument - should be `Nothing`. + index did not participate in the match. ? The Full Match The group with index 0 is always the full match of the pattern. 
@@ -46,60 +208,81 @@ type Match ? Named Groups by Index If the regex contained named groups, these may also be accessed by index based on their position in the pattern. + + Note that it is possible for a group to "not participate in the match", + for example with a disjunction. In the example below, the "(d)" group + does not participate -- it neither matches nor fails. + + "ab((c)|(d))".find "abc" + + In this case, the group id for "(d)", which is 3, is a valid group id and + (Pattern.lookup_group 3) will return 3. `groups` will return the + default value for groups that do not participate. + + > Example + Get a vector of the text matched by all of the groups in this match, + replacing the value for groups that didn't match with "UNMATCHED". + + import Standard.Examples + + example_groups = + match = Examples.match + match.groups default="UNMATCHED" groups : Any -> Vector (Text | Any) - groups self _ = Unimplemented.throw "This is an interface only." + groups self ~default=Nothing = + group_numbers = 0.up_to self.pattern.group_count + group_numbers.map n-> (self.text n . if_nothing default) - ## PRIVATE - - Gets a map containing the named capturing groups for the pattern, + ## Gets a map containing the named capturing groups for the pattern, replacing the value for groups that did not participate in the match with `default`. Arguments: - default: The value to return for a given name when the group at that - index did not participate in the match. This should default to - `Nothing`. + index did not participate in the match. + + Note that it is possible for a group to "not participate in the match", + for example with a disjunction. In the example below, the "(d)" group + does not participate -- it neither matches nor fails. + + "ab((c)|(d))".find "abc" + + In this case, the group id for "(d)", which is 3, is a valid group id and + (Pattern.lookup_group 3) will return 3. `named_groups` will map + a named group that does not participate to the default value. 
+ + > Example + Get the map of all of the named groups in this match, replacing the + value for groups that didn't participate in the match with "UNMATCHED". + + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + ## match.named_groups.keys.sort == ["empty", "letters"] named_groups : Any -> Map Text (Text | Any) - named_groups self _ = Unimplemented.throw "This is an interface only." + named_groups self default=Nothing = + named_group_names = self.pattern.group_names + spans = named_group_names.map name-> self.text name default=default + Map.from_vector (named_group_names.zip spans) - ## PRIVATE - - Gets the index of the first character captured by the group with the - given identifier, or `Nothing` if the group did not participate in the - match. + ## Gets the grapheme span matched by the group with the provided index, or + a default value if the group did not participate in the match. + If the identifier is invalid then `if_missing` is returned. Arguments: - - id: The identifier for the group to fetch the start index for. - start : Integer | Text -> Integer | Nothing ! No_Such_Group - start self _ = Unimplemented.throw "This is an interface only." + - id: The integer index or name of that group. + - if_missing: The value to return if the index is out of bounds. + get : Integer -> Any -> Text | Any + get self index ~if_missing=Nothing = + self.text index . catch No_Such_Group (_-> if_missing) - ## PRIVATE - - Gets the index of the first character after `start` that was not captured - by the group with the given identifier, or `Nothing` if the group did not - participate in the match. + ## Gets the grapheme span matched by the group with the provided index, or + a default value if the group did not participate in the match. + If the identifier is invalid then Index_Out_Of_Bounds is thrown. Arguments: - - id: The identifier for the group to fetch the end index for. 
- end : Integer | Text -> Integer | Nothing ! No_Such_Group - end self _ = Unimplemented.throw "This is an intercace only." - - ## PRIVATE - - Returns the span matched by the group with the provided identifier, or - `Nothing` if the group did not participate in the match. - - Arguments: - - id: The identifier for the group to fetch the end index for. - span : Integer | Text -> Span | Nothing ! No_Such_Group - span self _ = Unimplemented.throw "This is an interface only." - - ## PRIVATE - - Returns the start character index of the match's region. - start_position : Integer - start_position self = Unimplemented.throw "This is an interface only." - - ## Returns the end character index of the match's region. - end_position : Integer - end_position self = Unimplemented.throw "This is an interface only." + - id: The integer index or name of that group. + - if_missing: The value to return if the index is out of bounds. + at : Integer -> Text ! Index_Out_Of_Bounds + at self index = + self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count)) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso deleted file mode 100644 index 8be1b068d1..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso +++ /dev/null @@ -1,287 +0,0 @@ -import project.Any.Any -import project.Data.Map.Map -import project.Data.Numbers.Integer -import project.Data.Range.Extensions -import project.Data.Range.Range -import project.Data.Text.Regex_2.No_Such_Group -import project.Data.Text.Span.Span -import project.Data.Text.Span.Utf_16_Span -import project.Data.Text.Text -import project.Data.Vector.Vector -import project.Error.Error -import project.Errors.Common.Index_Out_Of_Bounds -import project.Nothing.Nothing -import project.Panic.Panic - -from project.Data.Boolean import Boolean, True, False - - -type Match_2 - ## 
internal_regex_result : RegexResult (Truffle) - (See https://github.com/oracle/graal/blob/master/regex/docs/README.md) - Value (pattern : Pattern_2) (internal_regex_result : Any) (input : Text) - - ## PRIVATE - Returns the start UTF16 character index of a group. - - This method goes directly to the internal match object. It does not - take group names, and does not have a default. - - Arguments: - - group: the integer group number. - internal_start : Integer -> Integer - internal_start self group = self.internal_regex_result.getStart group - - ## PRIVATE - Returns the end UTF16 character index, plus one, of a group. - - This method goes directly to the internal match object. It does not - take group names, and does not have a default. - - Arguments: - - group: the integer group number. - internal_end : Integer -> Integer - internal_end self group = self.internal_regex_result.getEnd group - - ## Returns the start UTF16 character index of a group. - - Arguments: - - group: the group name or number. Marked groups defined in the regex are - numbered starting at 1; group 0 refers to the entire match. - utf_16_start : Integer | Text -> Integer - utf_16_start self group=0 = - span = self.utf_16_span group - if span.is_nothing then Nothing else span.start - - ## Returns the end UTF16 character index, plus one, of a group. - - Arguments: - - group: the group name or number. Marked groups defined in the regex are - numbered starting at 1; group 0 refers to the entire match. - utf_16_end : Integer | Text -> Integer - utf_16_end self group=0 = - span = self.utf_16_span group - if span.is_nothing then Nothing else span.end - - ## Returns the start grapheme index of a group. - - ! What is a Character? - A character is defined as an Extended Grapheme Cluster, see Unicode - Standard Annex 29. This is the smallest unit that still has semantic - meaning in most text-processing applications. - - Arguments: - - group: the group name or number. 
Marked groups defined in the regex are - numbered starting at 1; group 0 refers to the entire match. - start : Integer | Text -> Integer - start self group=0 = - span = self.span group - if span.is_nothing then Nothing else span.start - - ## Returns the end grapheme index, plus one, of a group. - - ! What is a Character? - A character is defined as an Extended Grapheme Cluster, see Unicode - Standard Annex 29. This is the smallest unit that still has semantic - meaning in most text-processing applications. - - Arguments: - - group: the group name or number. Marked groups defined in the regex are - numbered starting at 1; group 0 refers to the entire match. - end : Integer | Text -> Integer - end self group=0 = - span = self.span group - if span.is_nothing then Nothing else span.end - - ## Gets the UTF16 span matched by the group with the provided identifier, or - a default value if the group did not participate in the match. If no such - group exists for the provided identifier, a `No_Such_Group` is returned. - - Arguments: - - group: The integer index or name of that group. - - ? The Full Match - The group with index 0 is always the full match of the pattern. - - ? Named Groups by Index - If the regex contained named groups, these may also be accessed by - index based on their position in the pattern. - - ! What is a Character? - A character is defined as an Extended Grapheme Cluster, see Unicode - Standard Annex 29. This is the smallest unit that still has semantic - meaning in most text-processing applications. - - Note that it is possible for a group to "not participate in the match", - for example with a disjunction. In the example below, the "(d)" group - does not participate -- it neither matches nor fails. - - "ab((c)|(d))".find "abc" - - In this case, the group id for "(d)", which is 3, is a valid group id and - (Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3, - Match_2.utf_16_span will return the default value. 
- utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group - utf_16_span self group=0 ~default=Nothing = - group_id = self.pattern.lookup_group group - start = self.internal_start group_id - end = self.internal_end group_id - does_not_participate = start == -1 || end == -1 - if does_not_participate then default else - range = Range.new start end - Utf_16_Span.Value range self.input - - ## Gets the grapheme span matched by the group with the provided identifier, or - a default value if the group did not participate in the match. If no such group - exists for the provided identifier, a `No_Such_Group` is returned. - - Arguments: - - group: The integer index or name of that group. - - ? The Full Match - The group with index 0 is always the full match of the pattern. - - ? Named Groups by Index - If the regex contained named groups, these may also be accessed by - index based on their position in the pattern. - - ! What is a Character? - A character is defined as an Extended Grapheme Cluster, see Unicode - Standard Annex 29. This is the smallest unit that still has semantic - meaning in most text-processing applications. - - Note that it is possible for a group to "not participate in the match", - for example with a disjunction. In the example below, the "(d)" group - does not participate -- it neither matches nor fails. - - "ab((c)|(d))".find "abc" - - In this case, the group id for "(d)", which is 3, is a valid group id and - (Pattern_2.lookup_group 3) will return 3. If the caller tries to get - group 3, Match_2.span will return the default value. - span : Integer | Text -> Any -> Span ! No_Such_Group - span self group=0 ~default=Nothing = - result = self.utf_16_span group Nothing - if result.is_nothing then default else result.to_grapheme_span - - ## Gets the Text matched by the group with the provided identifier, or - a default value if the group did not participate in the match. 
If no such - group exists for the provided identifier, a `No_Such_Group` is returned. - - Arguments: - - group: The integer index or name of that group. - - ? The Full Match - The group with index 0 is always the full match of the pattern. - - ? Named Groups by Index - If the regex contained named groups, these may also be accessed by - index based on their position in the pattern. - - Note that it is possible for a group to "not participate in the match", - for example with a disjunction. In the example below, the "(d)" group - does not participate -- it neither matches nor fails. - - "ab((c)|(d))".find "abc" - - In this case, the group id for "(d)", which is 3, is a valid group id and - (Pattern_2.lookup_group 3) will return 3. If the caller tries to get - group 3, Match_2.text will return the default value. - text : Integer | Text -> Any -> Text ! No_Such_Group - text self group=0 ~default=Nothing = - result = self.span group Nothing - if result.is_nothing then default else result.text - - ## Gets a vector containing the Text of _all_ of the capturing groups in - the pattern, replacing the value of groups that did not participate in - the match with `default`. This vector includes group 0, which contains - the entire match. - - Arguments: - - default: The value to return for a given index when the group at that - index did not participate in the match. - - ? The Full Match - The group with index 0 is always the full match of the pattern. - - ? Named Groups by Index - If the regex contained named groups, these may also be accessed by - index based on their position in the pattern. - - Note that it is possible for a group to "not participate in the match", - for example with a disjunction. In the example below, the "(d)" group - does not participate -- it neither matches nor fails. - - "ab((c)|(d))".find "abc" - - In this case, the group id for "(d)", which is 3, is a valid group id and - (Pattern_2.lookup_group 3) will return 3. 
`groups` will return the - default value for groups that do not participate. - - > Example - Get a vector of the text matched by all of the groups in this match, - replacing the value for groups that didn't match with "UNMATCHED". - - import Standard.Examples - - example_groups = - match = Examples.match - match.groups default="UNMATCHED" - groups : Any -> Vector (Text | Any) - groups self ~default=Nothing = - group_numbers = 0.up_to self.pattern.group_count - group_numbers.map n-> (self.text n . if_nothing default) - - ## Gets a map containing the named capturing groups for the pattern, - replacing the value for groups that did not participate in the match with - `default`. - - Arguments: - - default: The value to return for a given name when the group at that - index did not participate in the match. - - Note that it is possible for a group to "not participate in the match", - for example with a disjunction. In the example below, the "(d)" group - does not participate -- it neither matches nor fails. - - "ab((c)|(d))".find "abc" - - In this case, the group id for "(d)", which is 3, is a valid group id and - (Pattern_2.lookup_group 3) will return 3. `named_groups` will map - a named group that does not participate to the default value. - - > Example - Get the map of all of the named groups in this match, replacing the - value for groups that didn't participate in the match with "UNMATCHED". - - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - ## match.named_groups.keys.sort == ["empty", "letters"] - named_groups : Any -> Map Text (Text | Any) - named_groups self default=Nothing = - named_group_names = self.pattern.group_names - spans = named_group_names.map name-> self.text name default=default - Map.from_vector (named_group_names.zip spans) - - ## Gets the grapheme span matched by the group with the provided index, or - a default value if the group did not participate in the match. 
- If the identifier is invalid then `if_missing` is returned. - - Arguments: - - id: The integer index or name of that group. - - if_missing: The value to return if the index is out of bounds. - get : Integer -> Any -> Text | Any - get self index ~if_missing=Nothing = - self.text index . catch No_Such_Group (_-> if_missing) - - ## Gets the grapheme span matched by the group with the provided index, or - a default value if the group did not participate in the match. - If the identifier is invalid then Index_Out_Of_Bounds is thrown. - - Arguments: - - id: The integer index or name of that group. - - if_missing: The value to return if the index is out of bounds. - at : Integer -> Text ! Index_Out_Of_Bounds - at self index = - self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count)) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern.enso index 0a2e6a8f54..013b871738 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern.enso @@ -1,78 +1,183 @@ -import project.Data.Boolean.Boolean +import project.Any.Any +import project.Data.Filter_Condition.Filter_Condition +import project.Data.Map.Map import project.Data.Numbers.Integer -import project.Data.Text.Matching_Mode.Matching_Mode +import project.Data.Range.Extensions +import project.Data.Range.Range +import project.Data.Text.Helpers +import project.Data.Text.Span.Span +import project.Data.Text.Span.Utf_16_Span import project.Data.Text.Regex.Match.Match -import project.Data.Text.Regex.Regex_Mode.Regex_Mode +import project.Data.Text.Regex.No_Such_Group +import project.Data.Text.Regex.Replacer.Replacer import project.Data.Text.Text import project.Data.Vector.Vector -import project.Errors.Unimplemented.Unimplemented +import project.Errors.Common.Type_Error +import 
project.Error.Error +import project.Errors.Illegal_Argument.Illegal_Argument +import project.Meta import project.Nothing.Nothing +import project.Polyglot.Polyglot + +from project.Data.Boolean import Boolean, True, False +from project.Data.Index_Sub_Range import sort_and_merge_ranges + +polyglot java import org.enso.base.Replacer_Cache +polyglot java import org.enso.base.Text_Utils -## The `Data.Text.Regex.Engine.Pattern` interface. type Pattern + ## internal_regex_object : RegexObject (Truffle) + (See https://github.com/oracle/graal/blob/master/regex/docs/README.md) + Value (internal_regex_object : Any) - ## PRIVATE - - Tries to match the provided `input` against the pattern `self`. - - Arguments: - - input: The text to match the pattern described by `self` against. - - mode: The matching mode to use. This must default to `Regex_Mode.All`. - - This method will _always_ return `Nothing` if it fails to match. - - ? Return Type - When asked to match in a mode that can only provide a single match, the - return type is either a single `Match` object. When asked to match in a - mode that permits multiple matches, it will always return a `Vector`, - even if only a single match is found. - match : Text -> (Regex_Mode | Matching_Mode) -> Match | Vector Match | Nothing - match self _ _ = Unimplemented.throw "This is an interface only." - - ## PRIVATE - - Returns `True` if the input matches against the pattern described by + ## Returns `True` if the input matches against the pattern described by `self`, otherwise `False`. Arguments: - input: The text to check for matching. - matches : Text -> Boolean - matches self _ = Unimplemented.throw "This is an interface only." + matches : Text -> Boolean | Type_Error + matches self input = + Helpers.expect_text input <| + m = self.internal_regex_object.exec input 0 + m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length - ## PRIVATE + ## Tries to match the provided `input` against the pattern `self`. 
- Tries to find the text in the `input` that matches against the pattern - `self`. + Returns a `Match` containing the matched text and its match groups, or + `Nothing` if the match failed. Arguments: - - input: The text to find matches in. - - mode: The matching mode to use. This must default to `Regex_Mode.All` + - input: The text to match the pattern described by `self` against. + match : Text -> Match | Nothing | Type_Error + match self input = + Helpers.expect_text input <| + it = Match_Iterator.new self input + case it.next of + Match_Iterator_Value.Next _ match _ -> match + Match_Iterator_Value.Last _ -> Nothing - This method will _always_ return `Nothing` if it fails to find any - matches. + ## Tries to match the provided `input` against the pattern `self`. - ? Return Type - When asked to match in a mode that can only provide a single match, the - return type is either a single `Match` object. When asked to match in a - mode that permits multiple matches, it will always return a `Vector`, - even if only a single match is found. - find : Text -> (Regex_Mode | Matching_Mode) -> Text | Vector Text | Nothing - find self _ _ = Unimplemented.throw "This is an interface only." + Returns a `Vector Match` object, each containing the matched text + and its match groups. - ## PRIVATE + Arguments: + - input: The text to match the pattern described by `self` against. + match_all : Text -> Vector Match ! Type_Error + match_all self input = + Helpers.expect_text input <| + pattern_is_empty = self.internal_regex_object.pattern == '' + if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else + builder = Vector.new_builder + it = Match_Iterator.new self input + go it = case it.next of + Match_Iterator_Value.Next _ match next_it -> + builder.append match + @Tail_Call go next_it + Match_Iterator_Value.Last _ -> Nothing + go it + builder.to_vector + + ## Tries to match the provided `input` against the pattern `self`. 
+ + Returns a `Text` containing the matched text, or `Nothing` if the match + failed. + + Arguments: + - input: The text to match the pattern described by `self` against. + find : Text -> Text | Nothing | Type_Error + find self input = + Helpers.expect_text input <| + match_to_group_maybe <| self.match input + + ## Tries to match the provided `input` against the pattern `self`. + + Returns a `Vector Text`, each containing the matched text. + If the pattern does not match, an empty `Vector` is returned. + + Arguments: + - input: The text to match the pattern described by `self` against. + find_all : Text -> Vector Text | Type_Error + find_all self input = + Helpers.expect_text input <| + self.match_all input . map match_to_group_maybe + + ## ADVANCED Splits the `input` text based on the pattern described by `self`. Arguments: - input: The text to split based on the pattern described by `self`. - - mode: The splitting mode to use. This must default to `Regex_Mode.All`. + - only_first: If true, only split at the first occurrence. This method will _always_ return a vector. If no splits take place, the - vector will contain a single element. - split : Text -> (Matching_Mode | Integer | Regex_Mode) -> Vector Text - split self _ _ = Unimplemented.throw "This is an interface only." + vector will contain a single element (equal to the original string). - ## PRIVATE + > Example + Split on the first instance of the pattern. + pattern = Regex.compile "cd" + input = "abcdefcdghij" + texts = pattern.split input only_first=True + texts . should_equal ["ab", "efcdghij"] + + > Example + Split on all instances of the pattern in the input. + pattern = Regex.compile "a" + input = "bacadaeaf" + texts = pattern.split input + texts . should_equal ["b", "c", "d", "e", "f"] + + > Example + Returns the original text if there are no matches. + pattern = Regex.compile "aa" + input = "abcdefghij" + texts = pattern.split input + texts .
should_equal ["abcdefghij"] + split : Text -> Boolean -> Vector Text | Type_Error + split self input only_first=False = + Helpers.expect_text input <| + builder = Vector.new_builder + it = Match_Iterator.new self input + go next = case next of + Match_Iterator_Value.Next filler _ next_it -> + builder.append filler.text + next = if only_first then next_it.early_exit else next_it.next + @Tail_Call go next + Match_Iterator_Value.Last filler -> + builder.append filler.text + go it.next + builder.to_vector + + ## ADVANCED + + Takes an input string and returns all the matches as a `Vector Text`. + If the pattern contains marked groups, the values are concatenated + together; otherwise the whole match is returned. Non-participating + groups are omitted. + + Arguments: + - input: The text to tokenize. + + > Example + Split to blocks of 3 characters. + + Regex.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF'] + + > Example + Split to blocks of 3 characters taking first and third letters. + + Regex.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF'] + + > Example + Split a text on any white space. + + Regex.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' + == ['Hello','Big','Wide','World','Goodbye!'] + tokenize : Text -> Vector Text + tokenize self input = + self.match_all input . map (build_tokenization_output_from_match self _) + + ## ADVANCED Replace all occurrences of the pattern described by `self` in the `input` with the specified `replacement`. @@ -80,10 +185,250 @@ type Pattern Arguments: - input: The text in which to perform the replacement(s). - replacement: The literal text with which to replace any matches. - - mode: The matching mode to use for finding candidates to replace. This - must default to `Regex_Mode.All`. + - only_first: If True, only replace the first match. If this method performs no replacements it will return the `input` text unchanged. 
- replace : Text -> Text -> Regex_Mode | Matching_Mode | Integer -> Text - replace self _ _ _ = Unimplemented.throw "This is an interface only." + + The replacement string can contain references to groups matched by the + regex. The following syntaxes are supported: + $0: the entire match string + $&: the entire match string + $n: the nth group + $: Named group `foo` + + > Example + Replace letters in the text "aa". + + pattern = Regex.compile 'aa' + pattern.replace 'aaa' 'b' == 'ba' + + > Example + Replace all occurrences of letters 'l' and 'o' with '#'. + + pattern = Regex.compile '[lo]' + pattern.replace 'Hello World!' '#' == 'He### W#r#d!' + + > Example + Replace the first occurrence of letter 'l' with '#'. + + pattern = Regex.compile 'l' + pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!' + + > Example + Replace texts in quotes with parentheses. + + pattern = Regex.compile '"(.*?)"' + pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz' + + > Example + Replace a literal string with a replacement value. + + pattern = Regex.compile "aa" + input = "aa ab aa ac ad aa aa ax" + match = pattern.replace input "xyz" + match == "xyz ab xyz ac ad xyz xyz ax" + + > Example + Replace each word with the same word surrounded by `[]`. 
+ + pattern = Regex.compile "([a-z]+)" + pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]" + replace : Text -> Text -> Boolean -> Text | Type_Error + replace self input replacement only_first=False = + Helpers.expect_text input <| + it = Match_Iterator.new self input + case it of + Match_Iterator_Value.Last filler -> filler.text + _ -> + replacer = Replacer.new replacement self + + replacer.if_not_error <| + go next current = case next of + Match_Iterator_Value.Next filler match next_it -> + new_value = current + filler.text + (replacer.replace match) + next = if only_first then next_it.early_exit else next_it.next + @Tail_Call go next new_value + Match_Iterator_Value.Last filler -> + current + filler.text + go it.next "" + + ## PRIVATE + + Look up a match group name or number, and check that it is valid. + + Arguments: + - id: The name or number of the group that was asked for. + + Returns: a group number. + + A group number is invalid if it is outside the range of groups + that were in the original pattern. + + A group name is invalid if it was not defined in the original pattern. + + A group name is an alias for a group number; if a name is passed to + this method, it returns the corresponding group number. + + If a group number is passed to `lookup_group` and it is valid, it will + simply return the group number. + + Note that it is possible for a group to "not participate in the match", + for example with a disjunction. In the example below, the "(d)" group + does not participate -- it neither matches nor fails. + + "ab((c)|(d))".find "abc" + + In this case, the group id for "(d)", which is 3, is a valid group id and + (Pattern.lookup_group 3) will return 3. If the caller tries to get group 3, + Match.group will return Nothing. + + lookup_group : Integer | Text -> Integer ! 
No_Such_Group + lookup_group self id = + case id of + n : Integer -> case (n >= 0 && n < self.internal_regex_object.groupCount) of + True -> n + False -> Error.throw (No_Such_Group.Error n) + name : Text -> + # Maps name to number + groups = self.internal_regex_object.groups + + n = case groups of + # If Nothing, there are no named groups + Nothing -> Error.throw (No_Such_Group.Error name) + _ -> + qq = (read_group_map groups name) + case qq of + Nothing -> Nothing + n : Integer -> n + case n of + _ : Integer -> n + Nothing -> Error.throw (No_Such_Group.Error name) + + ## PRIVATE + + Return a lazy iterator over matches against a string. + + Arguments + - text: the string to match against. + iterator : Text -> Match_Iterator + iterator self input = Match_Iterator.new self input + + ## Return the number of groups in the underlying RegexObject. + Note, the count includes group 0 (the whole match) as well. + group_count : Integer + group_count self = self.internal_regex_object.groupCount + + ## Return a vector of all named group names. + group_names : Map Text Integer + group_names self = + map = polyglot_map_to_map self.internal_regex_object.groups + map.keys + +## PRIVATE + + Performs the regex match, and iterates through the results. Yields both + the matched parts of the string, and the 'filler' parts between them. + + The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because + matches and replacement boundaries can fall in the middle of multi- + character graphemes, thereby splitting them apart. + + At each step, it yields a Match_Iterator_Value, which has either a filler + and a match, or just the final filler. A Match_Iterator_Value.Last value is + returned at the end, and only at the end. + + Optionally, you can call `early_exit` to have it return the remainder of + the string, unmatched, as a single Last value. (Used for `replace` with + `only_first=True`.)
+type Match_Iterator + new : Pattern -> Text -> Match_Iterator + new pattern input = Match_Iterator.Value pattern input 0 + + Value (pattern : Pattern) (input : Text) (cursor : Integer) + + ## Return the next match, or the last filler string if there is no + additional match. + + Also returns the next iterator, if there was a match. + next : Match_Iterator_Value + next self = + regex_result = self.pattern.internal_regex_object.exec self.input self.cursor + case regex_result.isMatch of + False -> + filler_range = Range.new self.cursor (Text_Utils.char_length self.input) + filler_span = (Utf_16_Span.Value filler_range self.input) + Match_Iterator_Value.Last filler_span + True -> + match_start = regex_result.getStart 0 + filler_range = Range.new self.cursor match_start + filler_span = (Utf_16_Span.Value filler_range self.input) + match = Match.Value self.pattern regex_result self.input + next_cursor = match.utf_16_end 0 + next_iterator = Match_Iterator.Value self.pattern self.input next_cursor + Match_Iterator_Value.Next filler_span match next_iterator + + ## Returns the remainder of the string, unmatched. + early_exit : Match_Iterator_Value + early_exit self = + filler_range = Range.new self.cursor (Text_Utils.char_length self.input) + filler_span = Utf_16_Span.Value filler_range self.input + Match_Iterator_Value.Last filler_span + + to_text_debug : Vector Text + to_text_debug self = + vb = Vector.new_builder + go it = case it.next of + Match_Iterator_Value.Next filler match next_it -> + vb.append ('\"' + filler.text + '\"') + vb.append ("/" + (match.span 0).text + "/") + go next_it + Match_Iterator_Value.Last filler -> + vb.append ('\"' + filler.text + '\"') + go self + vb.to_vector + +## PRIVATE +type Match_Iterator_Value + Next (filler : Span) (match : Match) (next_iterator : Match_Iterator) + Last (filler : Span) + +## PRIVATE + Convert the polyglot map to a Map. 
+polyglot_map_to_map : Any -> Map Any Any +polyglot_map_to_map map = + polyglot_keys = Polyglot.get_members map + keys = Vector.from_polyglot_array polyglot_keys + pairs = keys.map key-> [key, Polyglot.get_member map key] + Map.from_vector pairs + +## PRIVATE + Get the named group from the polyglot map. +read_group_map : Any -> Text -> Integer | Nothing +read_group_map polyglot_map name = + map = polyglot_map_to_map polyglot_map + map.get name + +## PRIVATE +match_to_group_maybe : Match | Nothing -> Text | Nothing +match_to_group_maybe match = + if match.is_nothing then Nothing else match.text 0 + +## PRIVATE + Build an output string from a Match resulting from `tokenize`. + See `tokenize`. +build_tokenization_output_from_match : Pattern -> Match -> Text +build_tokenization_output_from_match pattern match = + if pattern.group_count == 1 then match.text 0 else + # Extract the ranges of the spans of all capturing groups + group_numbers = 1.up_to pattern.group_count + spans = group_numbers.map (n-> match.span n) . filter Filter_Condition.Not_Nothing + ranges = spans.map span-> case span of Span.Value range _ -> range + + # Eliminate nested capturing groups by sorting and merging the ranges. + top_level_ranges = sort_and_merge_ranges ranges + + # Reconstruct `Spans` from the synthesized `Ranges`, and concatenate. + text_all = case spans.at 0 of Span.Value _ text -> text + top_level_spans = top_level_ranges.map range-> Span.Value range text_all + top_level_spans.map (.text) . 
join diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso deleted file mode 100644 index ba61982b6a..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso +++ /dev/null @@ -1,425 +0,0 @@ -import project.Any.Any -import project.Data.Filter_Condition.Filter_Condition -import project.Data.Map.Map -import project.Data.Numbers.Integer -import project.Data.Range.Extensions -import project.Data.Range.Range -import project.Data.Text.Span.Span -import project.Data.Text.Span.Utf_16_Span -import project.Data.Text.Regex.Match_2.Match_2 -import project.Data.Text.Regex.Replacer.Replacer -import project.Data.Text.Regex_2.No_Such_Group -import project.Data.Text.Text -import project.Data.Vector.Vector -import project.Error.Error -import project.Errors.Illegal_Argument.Illegal_Argument -import project.Meta -import project.Nothing.Nothing -import project.Polyglot.Polyglot - -from project.Data.Boolean import Boolean, True, False -from project.Data.Index_Sub_Range import sort_and_merge_ranges - -polyglot java import org.enso.base.Replacer_Cache -polyglot java import org.enso.base.Text_Utils - -type Pattern_2 - ## internal_regex_object : RegexObject (Truffle) - (See https://github.com/oracle/graal/blob/master/regex/docs/README.md) - Value (internal_regex_object : Any) - - ## Returns `True` if the input matches against the pattern described by - `self`, otherwise `False`. - - Arguments: - - input: The text to check for matching. - matches : Text -> Boolean - matches self input = - m = self.internal_regex_object.exec input 0 - m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length - - ## Tries to match the provided `input` against the pattern `self`. - - Returns a `Match_2` containing the matched text and its match groups, or - `Nothing` if the match failed. 
- - Arguments: - - input: The text to match the pattern described by `self` against. - match : Text -> Match_2 | Nothing - match self input = - it = Match_Iterator.new self input - case it.next of - Match_Iterator_Value.Next _ match _ -> match - Match_Iterator_Value.Last _ -> Nothing - - ## Tries to match the provided `input` against the pattern `self`. - - Returns a `Vector Match_2` object, each containing the matched text - and its match groups. - - Arguments: - - input: The text to match the pattern described by `self` against. - match_all : Text -> Vector Match_2 ! Illegal_Argument - match_all self input = - pattern_is_empty = self.internal_regex_object.pattern == '' - if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else - builder = Vector.new_builder - it = Match_Iterator.new self input - go it = case it.next of - Match_Iterator_Value.Next _ match next_it -> - builder.append match - @Tail_Call go next_it - Match_Iterator_Value.Last _ -> Nothing - go it - builder.to_vector - - ## Tries to match the provided `input` against the pattern `self`. - - Returns a `Text` containing the matched text, or `Nothing` if the match - failed. - - Arguments: - - input: The text to match the pattern described by `self` against. - find : Text -> Text | Nothing - find self input = - match_to_group_maybe <| self.match input - - ## Tries to match the provided `input` against the pattern `self`. - - Returns a `Vector Text`, each containing the matched text. - If the pattern does not match, an empty `Vector` is returned. - - Arguments: - - input: The text to match the pattern described by `self` against. - find_all : Text -> Vector Text - find_all self input = - self.match_all input . map match_to_group_maybe - - ## ADVANCED - - Splits the `input` text based on the pattern described by `self`. - - Arguments: - - input: The text to split based on the pattern described by `self`. 
- - only_first: If True, only split at the first occurrence. - - This method will _always_ return a vector. If no splits take place, the - vector will contain a single element (equal to the original string). - - > Example - Split on the first instance of the pattern. - pattern = Regex_2.compile "cd" - input = "abcdefcdghij" - texts = pattern.split input only_first=True - texts . should_equal ["ab", "efcdghij"] - - > Example - Split on the all instances of the pattern in the input. - pattern = Regex_2.compile "a" - input = "bacadaeaf" - texts = pattern.split input - texts . should_equal ["b", "c", "d", "e", "f"] - - > Example - Returns the original text if there are no matches. - pattern = Regex_2.compile "aa" - input = "abcdefghij" - texts = pattern.split input - texts . should_equal ["abcdefghij"] - split : Text -> Boolean -> Vector Text - split self input only_first=False = - builder = Vector.new_builder - it = Match_Iterator.new self input - go next = case next of - Match_Iterator_Value.Next filler _ next_it -> - builder.append filler.text - next = if only_first then next_it.early_exit else next_it.next - @Tail_Call go next - Match_Iterator_Value.Last filler -> - builder.append filler.text - go it.next - builder.to_vector - - ## ADVANCED - - Takes an input string and returns all the matches as a `Vector Text`. - If the pattern contains marked groups, the values are concatenated - together; otherwise the whole match is returned. Non-participating - groups are omitted. - - Arguments: - - input: The text to tokenize. - - > Example - Split to blocks of 3 characters. - - Regex_2.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF'] - - > Example - Split to blocks of 3 characters taking first and third letters. - - Regex_2.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF'] - - > Example - Split a text on any white space. - - Regex_2.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' 
- == ['Hello','Big','Wide','World','Goodbye!'] - tokenize : Text -> Vector Text - tokenize self input = - self.match_all input . map (build_tokenization_output_from_match self _) - - ## ADVANCED - - Replace all occurrences of the pattern described by `self` in the `input` - with the specified `replacement`. - - Arguments: - - input: The text in which to perform the replacement(s). - - replacement: The literal text with which to replace any matches. - - only_first: If True, only replace the first match. - - If this method performs no replacements it will return the `input` text - unchanged. - - The replacement string can contain references to groups matched by the - regex. The following syntaxes are supported: - $0: the entire match string - $&: the entire match string - $n: the nth group - $: Named group `foo` - - > Example - Replace letters in the text "aa". - - pattern = Regex_2.compile 'aa' - pattern.replace 'aaa' 'b' == 'ba' - - > Example - Replace all occurrences of letters 'l' and 'o' with '#'. - - pattern = Regex_2.compile '[lo]' - pattern.replace 'Hello World!' '#' == 'He### W#r#d!' - - > Example - Replace the first occurrence of letter 'l' with '#'. - - pattern = Regex_2.compile 'l' - pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!' - - > Example - Replace texts in quotes with parentheses. - - pattern = Regex_2.compile '"(.*?)"' - pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz' - - > Example - Replace a literal string with a replacement value. - - pattern = Regex_2.compile "aa" - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "xyz" - match == "xyz ab xyz ac ad xyz xyz ax" - - > Example - Replace each word with the same word surrounded by `[]`. 
- - pattern = Regex_2.compile "([a-z]+)" - pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]" - replace : Text -> Text -> Boolean -> Text - replace self input replacement only_first=False = - it = Match_Iterator.new self input - case it of - Match_Iterator_Value.Last filler -> filler.text - _ -> - replacer = Replacer.new replacement self - - replacer.if_not_error <| - go next current = case next of - Match_Iterator_Value.Next filler match next_it -> - new_value = current + filler.text + (replacer.replace match) - next = if only_first then next_it.early_exit else next_it.next - @Tail_Call go next new_value - Match_Iterator_Value.Last filler -> - current + filler.text - go it.next "" - - ## PRIVATE - - Look up a match group name or number, and check that it is valid. - - Arguments: - - id: The name or number of the group that was asked for. - - Returns: a group number. - - A group number is invalid if it is outside the range of groups - that were in the original pattern. - - A group name is invalid if it was not defined in the original pattern. - - A group name is an alias for a group number; if a name is passed to - this method, it returns the corresponding group number. - - If a group number is passed to `lookup_group` and it is valid, it will - simply return the group number. - - Note that it is possible for a group to "not participate in the match", - for example with a disjunction. In the example below, the "(d)" group - does not participate -- it neither matches nor fails. - - "ab((c)|(d))".find "abc" - - In this case, the group id for "(d)", which is 3, is a valid group id and - (Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3, - Match_2.group will return Nothing. - - lookup_group : Integer | Text -> Integer ! 
No_Such_Group - lookup_group self id = - case id of - n : Integer -> case (n >= 0 && n < self.internal_regex_object.groupCount) of - True -> n - False -> Error.throw (No_Such_Group.Error n) - name : Text -> - # Maps name to number - groups = self.internal_regex_object.groups - - n = case groups of - # If Nothing, there are no named groups - Nothing -> Error.throw (No_Such_Group.Error name) - _ -> - qq = (read_group_map groups name) - case qq of - Nothing -> Nothing - n : Integer -> n - case n of - _ : Integer -> n - Nothing -> Error.throw (No_Such_Group.Error name) - - ## PRIVATE - - Return a lazy iterator over matches against a string. - - Arguments - - text: the string to match against. - iterator : Text -> Match_Iterator - iterator self input = Match_Iterator.new self input - - ## Return the number of groups in the underlying RegexObject. - Note, the count includes group 0 (the whole match) as well. - group_count : Integer - group_count self = self.internal_regex_object.groupCount - - ## Return a vector of all named group names. - group_names : Map Text Integer - group_names self = - map = polyglot_map_to_map self.internal_regex_object.groups - map.keys - -## PRIVATE - - Performs the regex match, and iterates through the results. Yields both - the matched parts of the string, and the 'filler' parts between them. - - The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because - matches and replacement boundaries can fall in the middle of multi- - character graphemes, thereby splitting them apart. - - At each step, it yields a Match_Iterator_Value, whivch has either a filler - and a match, or just the final filler. A Match_Iterator_Value.Last value is - return at the end, and only at the end. - - Optionally, you can call `early_exit` to have it return the remainder of - the string, unmatched, as a single Last value. (Used for `replace` with - `only_first=True`.) 
-type Match_Iterator - new : Pattern_2 -> Text -> Match_Iterator - new pattern input = Match_Iterator.Value pattern input 0 - - Value (pattern : Pattern_2) (input : Text) (cursor : Integer) - - ## Return the next match, or the last filler string if there is no - additional match. - - Also returns the next iterator, if there was a match. - next : Match_Iterator_Value - next self = - regex_result = self.pattern.internal_regex_object.exec self.input self.cursor - case regex_result.isMatch of - False -> - filler_range = Range.new self.cursor (Text_Utils.char_length self.input) - filler_span = (Utf_16_Span.Value filler_range self.input) - Match_Iterator_Value.Last filler_span - True -> - match_start = regex_result.getStart 0 - filler_range = Range.new self.cursor match_start - filler_span = (Utf_16_Span.Value filler_range self.input) - match = Match_2.Value self.pattern regex_result self.input - next_cursor = match.utf_16_end 0 - next_iterator = Match_Iterator.Value self.pattern self.input next_cursor - Match_Iterator_Value.Next filler_span match next_iterator - - ## Returns the remainder of the string, unmatched. - early_exit : Match_Iterator_Value - early_exit self = - filler_range = Range.new self.cursor (Text_Utils.char_length self.input) - filler_span = Utf_16_Span.Value filler_range self.input - Match_Iterator_Value.Last filler_span - - to_text_debug : Vector Text - to_text_debug self = - vb = Vector.new_builder - go it = case it.next of - Match_Iterator_Value.Next filler match next_it -> - vb.append ('\"' + filler.text + '\"') - vb.append ("/" + (match.span 0).text + "/") - go next_it - Match_Iterator_Value.Last filler -> - vb.append ('\"' + filler.text + '\"') - go self - vb.to_vector - -## PRIVATE -type Match_Iterator_Value - Next (filler : Span) (match : Match_2) (next_iterator : Match_Iterator) - Last (filler : Span) - -## PRIVATE - Convert the polyglot map to a Map. 
-polyglot_map_to_map : Any -> Map Any Any -polyglot_map_to_map map = - polyglot_keys = Polyglot.get_members map - keys = Vector.from_polyglot_array polyglot_keys - pairs = keys.map key-> [key, Polyglot.get_member map key] - Map.from_vector pairs - -## PRIVATE - Get the named group from the polyglot map. -read_group_map : Any -> Text -> Integer | Nothing -read_group_map polyglot_map name = - map = polyglot_map_to_map polyglot_map - map.get name - -## PRIVATE -match_to_group_maybe : Match_2 | Nothing -> Text | Nothing -match_to_group_maybe match = - if match.is_nothing then Nothing else match.text 0 - -## PRIVATE - Build an output string from a Match_2 resulting from `tokenize`. - See `tokenize`. -build_tokenization_output_from_match : Pattern_2 -> Match_2 -> Text -build_tokenization_output_from_match pattern match = - if pattern.group_count == 1 then match.text 0 else - # Extract the ranges of the spans of all capturing groups - group_numbers = 1.up_to pattern.group_count - spans = group_numbers.map (n-> match.span n) . filter Filter_Condition.Not_Nothing - ranges = spans.map span-> case span of Span.Value range _ -> range - - # Eliminate nested capturing groups by sorting and merging the ranges. - top_level_ranges = sort_and_merge_ranges ranges - - # Reconstruct `Spans` from the synthesized `Ranges`, and concatenate. - text_all = case spans.at 0 of Span.Value _ text -> text - top_level_spans = top_level_ranges.map range-> Span.Value range text_all - top_level_spans.map (.text) . join diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Regex_Mode.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Regex_Mode.enso deleted file mode 100644 index 47f59916d4..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Regex_Mode.enso +++ /dev/null @@ -1,28 +0,0 @@ -## A description of how the regex engine will match on the content. 
- - This lets you configure how you want to match, from the `First` match only, - to matching on the `Full` content of the input text. - -import project.Data.Numbers.Integer -import project.Data.Text.Matching_Mode.Matching_Mode - -type Regex_Mode - ## The regex will make all possible matches. - All - - ## The regex will only match if the _entire_ text matches. - Full - - ## The regex will only match within the region defined by start..end. - - Arguments: - - start: The inclusive start bound of the region. - - end: The exclusive end bound of the region. - - mode: The mode to use within the bounded region. - - ! Units - The `start` and `end` indices range over _characters_ in the text. The - precise definition of `character` is, for the moment, defined by the - regular expression engine itself. - Bounded (start : Integer) (end : Integer) (mode : (Matching_Mode.First | Matching_Mode.Last | Regex_Mode) = Regex_Mode.All) - diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Regex_Option.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Regex_Option.enso deleted file mode 100644 index e005fbbe18..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Regex_Option.enso +++ /dev/null @@ -1,44 +0,0 @@ -## Options are used to configure how a regex engine behaves. - - In this file, Enso provides a set of standard options that must be supported - by all regex engines integrated with Enso. - -type Regex_Option - - ## Specifies that all predefined character classes and POSIX character - classes will match _only_ on ASCII characters. - - ! Performance - If you are _sure_ that your data can only ever contain characters from - the ASCII character set, you may be able to obtain a performance boost - by specifying this flag. This may not be the case on all engines or all - regexes. - Ascii_Matching - - ## Specifies that matching should be performed in a case-insensitive manner. 
- Case_Insensitive - - ## Specifies that the regular expression should be interpreted in comments - mode. - - Comments mode has the following changes: - - Whitespace within the pattern is ignored, except when within a - character class or when preceded by an unescaped backslash, or within - grouping constructs (e.g. `(?...)`). - - When a line contains a `#`, that is not in a character class and is not - preceded by an unescaped backslash, all characters from the leftmost - such `#` to the end of the line are ignored. That is to say, they act - as _comments_ in the regex. - Comments - - ## Specifies that the `.` special character should match everything - _including_ newline characters. Without this flag, it will match all - characters _except_ newlines. - Dot_Matches_Newline - - ## Specifies that the pattern character `^` matches at both the beginning of - the string and at the beginning of each line (immediately following a - newline), and that the pattern character `$` matches at the end of each - line _and_ at the end of the string. 
- Multiline - diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Replacer.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Replacer.enso index d840c654b2..7a31ceaa00 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Replacer.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Replacer.enso @@ -1,10 +1,10 @@ import project.Data.Numbers.Integer import project.Data.Text.Extensions -import project.Data.Text.Regex.Match_2.Match_2 -import project.Data.Text.Regex.Pattern_2.Match_Iterator_Value -import project.Data.Text.Regex.Pattern_2.Pattern_2 -import project.Data.Text.Regex_2 -import project.Data.Text.Regex_2.No_Such_Group +import project.Data.Text.Regex +import project.Data.Text.Regex.Match.Match +import project.Data.Text.Regex.No_Such_Group +import project.Data.Text.Regex.Pattern.Match_Iterator_Value +import project.Data.Text.Regex.Pattern.Pattern import project.Data.Text.Span.Utf_16_Span import project.Data.Text.Text import project.Data.Vector.Vector @@ -23,7 +23,7 @@ type Replacer Implements a replacement for a regular expression. - Pattern_2.replace uses a Replacer to replace each regex match with + Pattern.replace uses a Replacer to replace each regex match with a replacement string. This string can contain references to match groups from the original regex. @@ -40,7 +40,7 @@ type Replacer Arguments - replacement_string: a string, possibly containing group references, that will be used to provide a replacement in a regex match. - new : Text -> Pattern_2 -> Replacer ! No_Such_Group + new : Text -> Pattern -> Replacer ! No_Such_Group new replacement_string pattern = Replacer.Value (build_replacement_vector_cached replacement_string pattern) @@ -48,7 +48,7 @@ type Replacer Arguments: - match: the match from the original string that is to be replaced. 
- replace : Match_2 -> Text + replace : Match -> Text replace self match = string_builder = StringBuilder.new self.replacement.each replacement-> @@ -82,7 +82,7 @@ group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))" Uses Replacement_Cache to avoid rebuilding the vector for recently used replacement strings. -build_replacement_vector_cached : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group +build_replacement_vector_cached : Text -> Pattern -> Vector Replacement ! No_Such_Group build_replacement_vector_cached replacement_string pattern = Replacer_Cache.get_or_set replacement_string _-> build_replacement_vector replacement_string pattern @@ -93,9 +93,9 @@ build_replacement_vector_cached replacement_string pattern = Parse the replacement string into an alternating series of literal strings and group reference numbers. -build_replacement_vector : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group +build_replacement_vector : Text -> Pattern -> Vector Replacement ! No_Such_Group build_replacement_vector replacement_string pattern = - replacement_pattern = Regex_2.compile group_reference_regex + replacement_pattern = Regex.compile group_reference_regex it = replacement_pattern.iterator replacement_string builder = Vector.new_builder @@ -117,14 +117,14 @@ build_replacement_vector replacement_string pattern = Parse a capture group reference. Arguments: - - pattern: the Pattern_2 used to initiate the replacement. This is used + - pattern: the Pattern used to initiate the replacement. This is used to identify and validate capture groups. - match: the match of the replacement string against group_reference_regex. Returns a Replacement: a group number, or, in the case of `$$`, a literal. See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions -parse_group_number : Pattern_2 -> Match_2 -> Replacement ! No_Such_Group +parse_group_number : Pattern -> Match -> Replacement ! 
No_Such_Group parse_group_number pattern match = case match.text.take 2 of "$$" -> Replacement.Literal "$" "$<" -> diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso deleted file mode 100644 index 9a02dc9a4c..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso +++ /dev/null @@ -1,86 +0,0 @@ -import project.Any.Any -import project.Data.Numbers.Integer -import project.Data.Text.Prim_Text_Helper -import project.Data.Text.Regex.Pattern_2.Pattern_2 -import project.Data.Text.Text -import project.Error.Error -import project.Errors.Illegal_Argument.Illegal_Argument -import project.Nothing.Nothing -import project.Panic.Panic - -from project.Data.Boolean import Boolean, True, False -from project.Errors.Common import Syntax_Error - -polyglot java import java.util.regex.Pattern as Java_Pattern - -## Compile the provided `expression` into a regex pattern that can be used for - matching. - - Arguments - - expression: The text representing the regular expression that you want to - compile. Must be non-empty. - - case_insensitive: Enables or disables case-insensitive matching. Case - insensitive matching behaves as if it normalises the case of all input - text before matching on it. - - If an empty regex is used, `compile` throws an Illegal_Argument error. - - ? Why Compile? - While many regex engines are able to cache ad-hoc patterns, it is often - useful to be able to manually retain a pattern that you have computed. This - function exists so you can hold onto the resultant `Pattern_2` object, - instead of immediately proceeding to match using it. -compile : Text -> Boolean | Nothing -> Pattern_2 ! 
Regex_Syntax_Error | Illegal_Argument -compile self expression case_insensitive=Nothing = - if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else - options_string = if case_insensitive == True then "usgi" else "usg" - - internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic-> - Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message)) - - Pattern_2.Value internal_regex_object - -## ADVANCED - - Escape the special characters in `expression` such that the result is a - valid literal pattern for the original string. - - Arguments: - - expression: The expression to escape metacharacters in. - - > Example - Turn a Text into a regex that matches that string exactly. - - import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine - import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option - - example_escape = - literal_string = "\!\.|abcde" - engine = Default_Engine.new - engine.escape literal_string -escape : Text -> Text -escape self expression = Java_Pattern.quote expression - -## An error that is emitted when there is no such group in the match for the - provided `id`. - - Arguments: - - id: The identifier of the group that was asked for but does not exist. -type No_Such_Group - Error (id : Text | Integer) - - ## PRIVATE - - Provides a human-readable representation of the `No_Such_Group`. - to_display_text : Text - to_display_text self = case self.id of - _ : Integer -> "No group exists with the index " + self.id.to_text + "." - _ : Text -> "No group exists with the name " + self.id + "." - -## A syntax error reported by the Truffle regex compiler. -type Regex_Syntax_Error - ## PRIVATE - - Arguments: - - message: A description of the erroneous syntax. 
- Error message diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_Matcher.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_Matcher.enso deleted file mode 100644 index 0c4d1486bf..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_Matcher.enso +++ /dev/null @@ -1,112 +0,0 @@ -import project.Any.Any -import project.Data.Text.Case_Sensitivity.Case_Sensitivity -import project.Data.Text.Matching -import project.Data.Text.Regex -import project.Data.Text.Regex.Pattern.Pattern -import project.Data.Text.Text -import project.Data.Vector.Vector -import project.Errors.Problem_Behavior.Problem_Behavior - -from project.Data.Boolean import Boolean, True, False - -## Represents regex matching mode. -type Regex_Matcher - ## Regex matching mode. - - Arguments: - - case_sensitivity: Specifies whether the matching should be case - sensitive. - - multiline: Enables or disables the multiline option. Multiline - specifies that the `^` and `$` pattern characters match the start and - end of lines, as to well as the start and end of the input, - respectively. - - match_ascii: Enables or disables pure-ASCII matching for the regex. If - you know your data only contains ASCII, you can enable this for a - performance boost on some regex engines. - - dot_matches_newline: Enables or disables the dot matches newline - option. This specifies that the `.` special character should match - everything _including_ newline characters. Without this flag, it - matches all characters _except_ newlines. - - comments: Enables or disables the comments mode for the regular - expression. In comments mode, the following changes apply: - - Whitespace within the pattern is ignored, except when within a - character class or when preceded by an unescaped backslash, or within - grouping constructs (e.g. `(?...)`). 
- - When a line contains a `#` that is not in a character class and is - not preceded by an unescaped backslash, all characters from the - leftmost such `#` to the end of the line are ignored. That is to say; - they act as 'comments' in the regex. - Value (case_sensitivity : Case_Sensitivity = Case_Sensitivity.Sensitive) (multiline : Boolean = False) (match_ascii : Boolean = False) (dot_matches_newline : Boolean = False) (comments : Boolean = False) - - ## UNSTABLE - Compiles a provided pattern according to the rules defined in this - `Regex_Matcher`. - compile : Text -> Pattern - compile self pattern = - case_insensitive = case self.case_sensitivity of - Case_Sensitivity.Default -> False - Case_Sensitivity.Sensitive -> False - ## TODO [RW] Currently locale is not supported in case-insensitive - Regex matching. There are plans to revisit it: - https://www.pivotaltracker.com/story/show/181313576 - Case_Sensitivity.Insensitive _ -> True - Regex.compile pattern case_insensitive=case_insensitive match_ascii=self.match_ascii dot_matches_newline=self.dot_matches_newline multiline=self.multiline comments=self.comments - - ## UNSTABLE - Checks if a name matches the provided criterion according to the specified - matching strategy. - - Arguments: - - name: A `Text` representing the name being matched. - - criterion: A `Text` representing the regular expression specifying the - matching criterion. - - > Example - Check if the provided name matches a regular expression. - - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive) . match_single_criterion "Foobar" "f.*" == True - match_single_criterion : Text -> Text -> Boolean - match_single_criterion self name criterion = - self.compile criterion . matches name - - ## UNSTABLE - Selects objects from an input list that match any of the provided criteria. - - Arguments: - - objects: A list of objects to be matched. - - criteria: A list of texts representing the matching criteria. 
Their meaning - depends on the matching strategy. - - reorder: Specifies whether to reorder the matched objects according to the - order of the matching criteria. - If `False`, the matched entries are returned in the same order as in the - input. - If `True`, the matched entries are returned in the order of the criteria - matching them. If a single object has been matched by multiple criteria, it - is placed in the group belonging to the first matching criterion on the - list. - If a single criterion's group has more than one element, their relative - order is the same as in the input. - - name_mapper: A function mapping a provided object to its name, which will - then be matched with the criteria. It is set to the identity function by - default, thus allowing the input to be a list of names to match. But it can - be overridden to enable matching more complex objects. - - matcher: A `Matcher` instance specifying how to interpret the criterion. - - on_problems: Specifies the behavior when a problem occurs during the - function. - By default, a warning is issued, but the operation proceeds. - If set to `Report_Error`, the operation fails with a dataflow error. - If set to `Ignore`, the operation proceeds without errors or warnings. - - > Example - Selects objects matching one of the provided patterns, preserving the input order. - - Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive . match_criteria ["foo", "foobar", "quux", "baz", "Foo"] [".*ba.*", "f.*"] == ["foo", "foobar", "baz"] - - > Example - Selects pairs matching their first element with the provided criteria and - ordering the result according to the order of criteria that matched them. 
- - Text_Matcher.match_criteria [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10] ["bar", "foo"] reorder=True name_mapper=_.name == [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0] - match_criteria : Vector Any -> Vector Text -> Boolean -> (Any -> Text) -> Problem_Behavior -> Vector Any ! Matching.No_Matches_Found - match_criteria self objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning = - Matching.match_criteria_implementation self objects criteria reorder name_mapper on_problems diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Matcher.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Matcher.enso deleted file mode 100644 index 234509909e..0000000000 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Text_Matcher.enso +++ /dev/null @@ -1,75 +0,0 @@ -import project.Any.Any -import project.Data.Locale.Locale -import project.Data.Text.Matching -import project.Data.Text.Text -import project.Data.Vector.Vector -import project.Errors.Problem_Behavior.Problem_Behavior - -from project.Data.Boolean import Boolean, True, False - -## Represents exact text matching mode. -type Text_Matcher - ## Represents exact text matching mode. - Case_Sensitive - - ## Represents case-insensitive text matching mode. - Case_Insensitive (locale:Locale=Locale.default) - - ## UNSTABLE - Checks if a name matches the provided criterion according to the specified - matching strategy. - - Arguments: - - name: A `Text` representing the name being matched. - - criterion: A `Text` representing the name to be matched. - - > Example - Check if the provided name matches a regular expression. 
- - Text_Matcher.match_single_criterion "Foobar" "foo" == False - match_single_criterion : Text -> Text -> Boolean - match_single_criterion self name criterion = case self of - Text_Matcher.Case_Sensitive -> name == criterion - Text_Matcher.Case_Insensitive locale -> name.equals_ignore_case criterion locale=locale - - ## UNSTABLE - Selects objects from an input list that match any of the provided criteria. - - Arguments: - - objects: A list of objects to be matched. - - criteria: A list of texts representing the matching criteria. Their meaning - depends on the matching strategy. - - reorder: Specifies whether to reorder the matched objects according to the - order of the matching criteria. - If `False`, the matched entries are returned in the same order as in the - input. - If `True`, the matched entries are returned in the order of the criteria - matching them. If a single object has been matched by multiple criteria, it - is placed in the group belonging to the first matching criterion on the - list. - If a single criterion's group has more than one element, their relative - order is the same as in the input. - - name_mapper: A function mapping a provided object to its name, which will - then be matched with the criteria. It is set to the identity function by - default, thus allowing the input to be a list of names to match. But it can - be overridden to enable matching more complex objects. - - matcher: A `Matcher` instance specifying how to interpret the criterion. - - on_problems: Specifies the behavior when a problem occurs during the - function. - By default, a warning is issued, but the operation proceeds. - If set to `Report_Error`, the operation fails with a dataflow error. - If set to `Ignore`, the operation proceeds without errors or warnings. - - > Example - Selects objects matching one of the provided patterns, preserving the input order. - - Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive . 
match_criteria ["foo", "foobar", "quux", "baz", "Foo"] [".*ba.*", "f.*"] == ["foo", "foobar", "baz"] - - > Example - Selects pairs matching their first element with the provided criteria and - ordering the result according to the order of criteria that matched them. - - Text_Matcher.match_criteria [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10] ["bar", "foo"] reorder=True name_mapper=_.name == [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0] - match_criteria : Vector Any -> Vector Text -> Boolean -> (Any -> Text) -> Problem_Behavior -> Vector Any ! Matching.No_Matches_Found - match_criteria self objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning = - Matching.match_criteria_implementation self objects criteria reorder name_mapper on_problems diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso index a883d8ed96..1cf329e5a0 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Main.enso @@ -94,10 +94,6 @@ import project.Data.Text.Line_Ending_Style.Line_Ending_Style import project.Data.Text.Location.Location import project.Data.Text.Matching_Mode.Matching_Mode import project.Data.Text.Regex -import project.Data.Text.Regex.Regex_Mode.Regex_Mode -import project.Data.Text.Regex.Regex_Option.Regex_Option -import project.Data.Text.Regex_Matcher.Regex_Matcher -import project.Data.Text.Text_Matcher.Text_Matcher import project.Data.Text.Text_Ordering.Text_Ordering import project.Data.Text.Text_Sub_Range.Text_Sub_Range import project.Data.Time.Date.Date @@ -146,10 +142,6 @@ export project.Data.Text.Line_Ending_Style.Line_Ending_Style export project.Data.Text.Location.Location export project.Data.Text.Matching_Mode.Matching_Mode export project.Data.Text.Regex -export project.Data.Text.Regex.Regex_Mode.Regex_Mode -export 
project.Data.Text.Regex.Regex_Option.Regex_Option -export project.Data.Text.Regex_Matcher.Regex_Matcher -export project.Data.Text.Text_Matcher.Text_Matcher export project.Data.Text.Text_Ordering.Text_Ordering export project.Data.Text.Text_Sub_Range.Text_Sub_Range export project.Data.Time.Date.Date diff --git a/distribution/lib/Standard/Examples/0.0.0-dev/src/Main.enso b/distribution/lib/Standard/Examples/0.0.0-dev/src/Main.enso index a9fdb72d43..b2447c0236 100644 --- a/distribution/lib/Standard/Examples/0.0.0-dev/src/Main.enso +++ b/distribution/lib/Standard/Examples/0.0.0-dev/src/Main.enso @@ -1,12 +1,10 @@ from Standard.Base import all -import Standard.Base.Errors.Common.No_Such_Method +import Standard.Base.Data.Text.Regex.Match.Match +import Standard.Base.Errors.Common.No_Such_Method import Standard.Base.Network.HTTP.Response.Response import Standard.Base.Network.HTTP.Response_Body.Response_Body -import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine -import Standard.Base.Data.Text.Regex.Engine.Default.Match as Default_Engine_Match - from Standard.Table import Table, Column from Standard.Image import Image, Read_Flag, Matrix @@ -269,8 +267,7 @@ transactions_table = (enso_project.data / "food_shop_transactions.csv") . read ## An example regex match. -match : Default_Engine_Match +match : Match match = - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" [] - pattern.match "aa ab abc a bc bcd" mode=Matching_Mode.First + pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" + pattern.match "aa ab abc a bc bcd" diff --git a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java index 6e73b53449..9d4836d379 100644 --- a/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java +++ b/std-bits/base/src/main/java/org/enso/base/Regex_Utils.java @@ -6,71 +6,6 @@ import java.util.regex.Pattern; public class Regex_Utils { - /** - * Obtains the names for named groups.
- * - * <p>Assumes that the provided {@link Pattern} is syntactically valid. Behaviour is undefined if - * run on a syntactically invalid pattern. - * - * @param pattern the pattern for which to get the group names - * @return the names for the named groups in {@code pattern} - */ - public static String[] get_group_names(Pattern pattern) { - String pattern_text = pattern.pattern(); - - char[] characters = pattern_text.toCharArray(); - ArrayList<String> names = new ArrayList<>(); - - for (int i = 0; i < pattern_text.length(); ++i) { - char character = characters[i]; - - if (character == '\\') { - ++i; - break; - } - - String header = "(?<"; - - if (pattern_text.startsWith(header, i)) { - i += header.length(); - StringBuilder buffer = new StringBuilder(); - - while (i < pattern_text.length()) { - character = characters[i]; - - if (character == '>') { - break; - } - - ++i; - - buffer.append(character); - } - - names.add(buffer.toString()); - } - } - - return names.toArray(new String[0]); - } - - /** - * Looks for matches of the provided regular expression in the provided text. - * - * <p>This should behave exactly the same as `Regex.compile regex . find text` in Enso, it is here - * only as a temporary workaround, because the Enso function gives wrong results on examples like - * `Regex.compile "([0-9]+|[^0-9]+)" . find "1a2c"` where it returns `[1, a, 2]` instead of `[1, - * a, 2, c]`. - */ - public static String[] find_all_matches(String regex, String text) { - var allMatches = new ArrayList<String>(); - Matcher m = Pattern.compile(regex).matcher(text); - while (m.find()) { - allMatches.add(m.group()); - } - return allMatches.toArray(new String[0]); - } - /** * Converts a SQL-like pattern into a Regex with the same semantics. * @@ -87,7 +22,7 @@ public class Regex_Utils { // Before inserting the converted wildcard, we append the accumulated characters, quoting // them first. if (acc.length() > 0) { - result.append(Pattern.quote(acc.toString())); + result.append(regexQuote(acc.toString())); acc.setLength(0); } @@ -103,7 +38,7 @@ public class Regex_Utils { // If any trailing characters were left, we append them too. if (acc.length() > 0) { - result.append(Pattern.quote(acc.toString())); + result.append(regexQuote(acc.toString())); } return result.toString(); diff --git a/test/Meta_Test_Suite_Tests/src/Main.enso b/test/Meta_Test_Suite_Tests/src/Main.enso index 8535fa104d..5d50802dbe 100644 --- a/test/Meta_Test_Suite_Tests/src/Main.enso +++ b/test/Meta_Test_Suite_Tests/src/Main.enso @@ -8,7 +8,7 @@ type Setup make_expected_output_regex expected_output = parts = expected_output.split "???" .
map Regex.escape - Regex.compile (parts.join ".+") dot_matches_newline=True + Regex.compile (parts.join ".+") spec setup = run_test source_path = diff --git a/test/Tests/src/Data/Range_Spec.enso b/test/Tests/src/Data/Range_Spec.enso index 93f9fc5f6c..582b258508 100644 --- a/test/Tests/src/Data/Range_Spec.enso +++ b/test/Tests/src/Data/Range_Spec.enso @@ -2,6 +2,7 @@ from Standard.Base import all import Standard.Base.Data.Range.Empty_Error import Standard.Base.Errors.Common.Index_Out_Of_Bounds import Standard.Base.Errors.Common.No_Such_Method +import Standard.Base.Errors.Common.Type_Error import Standard.Base.Errors.Common.Unsupported_Argument_Types import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.Errors.Illegal_State.Illegal_State @@ -143,8 +144,8 @@ spec = Test.group "Range" <| range.filter (Filter_Condition.Not_In [7, 3, 2]) . should_equal [1, 4, 5] Test.expect_panic_with (range.filter (Filter_Condition.Starts_With "a")) No_Such_Method - Test.expect_panic_with (range.filter (Filter_Condition.Like "a%")) Unsupported_Argument_Types - Test.expect_panic_with (range.filter (Filter_Condition.Not_Like "a_")) Unsupported_Argument_Types + range.filter (Filter_Condition.Like "a%") . should_fail_with Type_Error + range.filter (Filter_Condition.Not_Like "a_") . should_fail_with Type_Error range.filter Filter_Condition.Is_True . should_equal [] range.filter Filter_Condition.Is_False . should_equal [] range.filter Filter_Condition.Is_Nothing . 
should_equal [] diff --git a/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso b/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso deleted file mode 100644 index a648924207..0000000000 --- a/test/Tests/src/Data/Text/Default_Regex_Engine_Spec.enso +++ /dev/null @@ -1,613 +0,0 @@ -from Standard.Base import all -import Standard.Base.Data.Text.Span.Utf_16_Span -import Standard.Base.Errors.Common.Syntax_Error - -import Standard.Base.Data.Text.Matching_Mode.Matching_Mode -from Standard.Base.Data.Text.Regex import No_Such_Group, Invalid_Option -import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine -import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option - -polyglot java import java.util.regex.Pattern as Java_Pattern - -from Standard.Test import Test, Test_Suite -import Standard.Test.Extensions - -default_mask = Java_Pattern.CANON_EQ.bit_or Java_Pattern.UNICODE_CASE . bit_or Java_Pattern.UNICODE_CHARACTER_CLASS - -spec = - Test.group "The default regex engine's options handling" <| - - Test.specify "should convert options to Java" <| - options = [Regex_Option.Comments, Regex_Option.Multiline, Default_Engine.Option.Unix_Lines] - expected_mask = Java_Pattern.UNIX_LINES.bit_or Java_Pattern.COMMENTS . bit_or Java_Pattern.MULTILINE . bit_or default_mask - actual_mask = Default_Engine.from_enso_options options - - actual_mask . should_equal expected_mask - - Test.specify "should specify the unicode options by default" <| - actual_mask = Default_Engine.from_enso_options [] - - actual_mask . should_equal default_mask - - Test.specify "should handle ascii matching by disabling unicode" <| - actual_mask = Default_Engine.from_enso_options [Regex_Option.Ascii_Matching] - actual_mask . should_equal 0 - - Test.specify "should result in an error when an option is invalid" <| - Default_Engine.from_enso_options [""] . should_fail_with Invalid_Option - Default_Engine.from_enso_options ["", Regex_Option.Ascii_Matching] . 
should_fail_with Invalid_Option - - Test.group "The default regex engine (Default_Engine)" <| - - Test.specify "should be able to compile patterns with no options" <| - engine = Default_Engine.new - pattern = engine.compile "^a$" [] - pattern.engine . should_equal engine - pattern.options . should_equal [] - pattern.internal_pattern.flags . should_equal default_mask - - Test.specify "should be able to compile patterns with global options" <| - engine = Default_Engine.new - pattern = engine.compile "^a$" [Regex_Option.Multiline] - pattern.engine . should_equal engine - pattern.options . should_equal [Regex_Option.Multiline] - pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.MULTILINE) - - Test.specify "should be able to compile patterns with engine-specific options" <| - engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern] - pattern = engine.compile "^a$" [] - pattern.engine . should_equal engine - pattern.options . should_equal [Default_Engine.Option.Literal_Pattern] - pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.LITERAL) - - Test.specify "should be able to compile patterns with combined options" <| - engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern] - pattern = engine.compile "^a$" [Regex_Option.Comments] - pattern.engine . should_equal engine - pattern.options.contains Default_Engine.Option.Literal_Pattern . should_be_true - pattern.options.contains Regex_Option.Comments . should_be_true - pattern.internal_pattern.flags . should_equal (default_mask . bit_or Java_Pattern.LITERAL . bit_or Java_Pattern.COMMENTS) - - Test.specify "should return a syntax error of the regex syntax is invalid" <| - engine = Default_Engine.new - engine.compile "^(a" [] . should_fail_with Syntax_Error - - Test.specify "should throw an invalid options error if an option is invalid" <| - engine = Default_Engine.new - engine.compile "^a$" ["invalid"] . 
should_fail_with Invalid_Option - - Test.specify "should escape an expression for use as a literal" <| - pattern = "http://example.com" - engine = Default_Engine.new - engine.escape pattern . should_equal "\Qhttp://example.com\E" - - Test.group "The default regex engine's Pattern.matches" <| - engine = Default_Engine.new - - Test.specify "should return True when the pattern matches against the input" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - pattern.matches input . should_be_true - - Test.specify "should return False when the pattern doesn't match against the input" <| - pattern = engine.compile "aaz" [] - input = "aa ab abc a bc bcd" - pattern.matches input . should_be_false - - Test.specify "should check for full matches" <| - pattern = engine.compile "f.o" [] - pattern.matches "foo" . should_be_true - pattern.matches "foobar" . should_be_false - - Test.group "The default regex engine's Pattern.match" <| - engine = Default_Engine.new - - Test.specify "should be able to `match` the first instance of the pattern in the input" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - match . should_be_a Default_Engine.Match.Value - match.group 0 . should_equal input - - Test.specify "should return `Nothing` if there are no matches in first mode" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "abc" - match = pattern.match input mode=Matching_Mode.First - match . should_equal Nothing - - Test.specify "should be able to `match` at most N instances of the pattern in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.match input mode=3 - match.length . should_equal 3 - match.at 0 . group 0 . should_equal "ab" - match.at 1 . group 0 . should_equal "cd" - match.at 2 . group 0 . 
should_equal "ef" - - Test.specify "should `match` fewer than N instances when there are fewer than N in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdef" - match = pattern.match input mode=5 - match.length . should_equal 3 - match.at 0 . group 0 . should_equal "ab" - match.at 1 . group 0 . should_equal "cd" - match.at 2 . group 0 . should_equal "ef" - - Test.specify "should return `Nothing` when a counted match fails" <| - pattern = engine.compile "(aa)" [] - input = "abcdefghij" - match = pattern.match input mode=3 - match . should_equal Nothing - - Test.specify "should be able to `match` the all instances of the pattern in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.match input mode=Regex_Mode.All - match.length . should_equal 5 - match.at 0 . group 0 . should_equal "ab" - match.at 1 . group 0 . should_equal "cd" - match.at 2 . group 0 . should_equal "ef" - match.at 3 . group 0 . should_equal "gh" - match.at 4 . group 0 . should_equal "ij" - - Test.specify "should return `Nothing` when an all match match fails" <| - pattern = engine.compile "(aa)" [] - input = "abcdefghij" - match = pattern.match input mode=Regex_Mode.All - match . should_equal Nothing - - Test.specify "should be able to `match` the pattern against the entire input" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Regex_Mode.Full - match . should_be_a Default_Engine.Match.Value - match.group 0 . should_equal input - - Test.specify "should return `Nothing` if a full match does not match the entire input" <| - pattern = engine.compile "(..)" [] - input = "aa ab" - full_match = pattern.match input mode=Regex_Mode.Full - full_match . should_equal Nothing - match = pattern.match input mode=Matching_Mode.First - match . 
should_be_a Default_Engine.Match.Value - - Test.specify "should be able to `match` the pattern against bounded input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.match input mode=(Regex_Mode.Bounded 2 8) - match.length . should_equal 3 - match.at 0 . group 0 . should_equal "cd" - match.at 1 . group 0 . should_equal "ef" - match.at 2 . group 0 . should_equal "gh" - - Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <| - pattern = engine.compile "" [] - match_1 = pattern.match "" mode=Regex_Mode.All - match_1.length . should_equal 1 - match_1.at 0 . start 0 . should_equal 0 - match_1.at 0 . end 0 . should_equal 0 - - match_2 = pattern.match "ABC" mode=Regex_Mode.All - match_2.length . should_equal 4 - match_2.at 0 . start 0 . should_equal 0 - match_2.at 0 . end 0 . should_equal 0 - match_2.at 1 . start 0 . should_equal 1 - match_2.at 1 . end 0 . should_equal 1 - match_2.at 3 . start 0 . should_equal 3 - match_2.at 3 . end 0 . should_equal 3 - - Test.group "The default regex engine's Pattern.find" <| - engine = Default_Engine.new - - Test.specify "should be able to `find` the first instance of the pattern in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.find input mode=Matching_Mode.First - match . should_be_a Text - match . should_equal "ab" - - Test.specify "should return `Nothing` if there are no matches in first mode" <| - pattern = engine.compile "(aa)" [] - input = "abcdefghij" - match = pattern.find input mode=Matching_Mode.First - match . should_equal Nothing - - Test.specify "should be able to `find` at most N instances of the pattern in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.find input mode=3 - match.length . should_equal 3 - match.at 0 . should_equal "ab" - match.at 1 . should_equal "cd" - match.at 2 . 
should_equal "ef" - - Test.specify "should `find` fewer than N instances when there are fewer than N in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdef" - match = pattern.find input mode=5 - match.length . should_equal 3 - match.at 0 . should_equal "ab" - match.at 1 . should_equal "cd" - match.at 2 . should_equal "ef" - - Test.specify "should return `Nothing` when a counted match fails" <| - pattern = engine.compile "(aa)" [] - input = "abcdefghij" - match = pattern.find input mode=3 - match . should_equal Nothing - - Test.specify "should be able to `find` the all instances of the pattern in the input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.find input mode=Regex_Mode.All - match.length . should_equal 5 - match.at 0 . should_equal "ab" - match.at 1 . should_equal "cd" - match.at 2 . should_equal "ef" - match.at 3 . should_equal "gh" - match.at 4 . should_equal "ij" - - Test.specify "should return `Nothing` when an all match match fails" <| - pattern = engine.compile "(aa)" [] - input = "abcdefghij" - match = pattern.find input mode=Regex_Mode.All - match . should_equal Nothing - - Test.specify "should be able to `find` the pattern against the entire input" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.find input mode=Regex_Mode.Full - match . should_be_a Text - match . should_equal input - - Test.specify "should return `Nothing` if a full find does not match the entire input" <| - pattern = engine.compile "(..)" [] - input = "aa ab" - full_match = pattern.find input mode=Regex_Mode.Full - full_match . should_equal Nothing - - Test.specify "should be able to `find` the pattern against bounded input" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - match = pattern.find input mode=(Regex_Mode.Bounded 2 8) - match.length . should_equal 3 - match.at 0 . should_equal "cd" - match.at 1 . should_equal "ef" - match.at 2 . 
should_equal "gh" - - match_2 = pattern.find input mode=(Regex_Mode.Bounded 2 8 mode=10) - match_2.length . should_equal 3 - match_2.at 0 . should_equal "cd" - match_2.at 1 . should_equal "ef" - match_2.at 2 . should_equal "gh" - - match_3 = pattern.find input mode=(Regex_Mode.Bounded 2 8 mode=2) - match_3.length . should_equal 2 - match_3.at 0 . should_equal "cd" - match_3.at 1 . should_equal "ef" - - Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <| - engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"] - engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"] - engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"] - - engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=5 . should_equal ["a", "1", "b", "2"] - engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"] - engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"] - engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Regex_Mode.Bounded 1 3) . should_equal ["1", "b"] - - Test.group "The default regex engine's Pattern.split" <| - engine = Default_Engine.new - - Test.specify "should be able to `split` on the first instance of the pattern" <| - pattern = engine.compile "cd" [] - input = "abcdefghij" - match = pattern.split input mode=Matching_Mode.First - match.length . should_equal 2 - match.at 0 . should_equal "ab" - match.at 1 . should_equal "efghij" - - Test.specify "should return the original text if there are no matches in first mode" <| - pattern = engine.compile "(aa)" [] - input = "abcdefghij" - match = pattern.split input mode=Matching_Mode.First - match . 
should_equal ["abcdefghij"] - - Test.specify "should be able to `split` on at most N instances of the pattern in the input" <| - pattern = engine.compile "a" [] - input = "bacadaeaf" - match = pattern.split input mode=3 - match.length . should_equal 4 - match.at 0 . should_equal "b" - match.at 1 . should_equal "c" - match.at 2 . should_equal "d" - match.at 3 . should_equal "eaf" - - Test.specify "should `split` on fewer than N instances when there are fewer than N in the input" <| - pattern = engine.compile "a" [] - input = "bacadaeaf" - match = pattern.split input mode=10 - match.length . should_equal 5 - match.at 0 . should_equal "b" - match.at 1 . should_equal "c" - match.at 2 . should_equal "d" - match.at 3 . should_equal "e" - match.at 4 . should_equal "f" - - Test.specify "should be able to `split` on the all instances of the pattern in the input" <| - pattern = engine.compile "(a)" [] - input = "bacadaeaf" - match = pattern.split input mode=Regex_Mode.All - match.length . should_equal 5 - match.at 0 . should_equal "b" - match.at 1 . should_equal "c" - match.at 2 . should_equal "d" - match.at 3 . should_equal "e" - match.at 4 . should_equal "f" - - Test.group "The default regex engine's Pattern.replace" <| - engine = Default_Engine.new - - Test.specify "should be able to `replace` the first instance of the pattern in the input" <| - pattern = engine.compile "abc" [] - input = "aa ab abc a bc abc" - match = pattern.replace input "REPLACED" mode=Matching_Mode.First - match . should_be_a Text - match . should_equal "aa ab REPLACED a bc abc" - - Test.specify "should return the string unchanged if there are no matches to replace in first mode" <| - pattern = engine.compile "xyz" [] - input = "aa ab ac ad" - match = pattern.replace input "REPLACED" mode=Matching_Mode.First - match . 
should_equal input - - Test.specify "should be able to replace at most N instances of the pattern in the input" <| - pattern = engine.compile "aa" [] - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "REPLACED" mode=3 - match . should_equal "REPLACED ab REPLACED ac ad REPLACED aa ax" - - Test.specify "should replace fewer than N instances when there are fewer than N in the input" <| - pattern = engine.compile "aa" [] - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "REPLACED" mode=10 - match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax" - - Test.specify "should return the input when a counted replace fails" <| - pattern = engine.compile "aa" [] - input = "abcdefghij" - match = pattern.replace input "REPLACED" mode=3 - match . should_equal input - - Test.specify "should be able to replace the all instances of the pattern in the input" <| - pattern = engine.compile "aa" [] - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "REPLACED" mode=Regex_Mode.All - match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax" - - Test.specify "should return the input when an all replace fails" <| - pattern = engine.compile "aa" [] - input = "abcdefghij" - match = pattern.replace input "REPLACED" mode=Regex_Mode.All - match . should_equal input - - Test.specify "should be able to replace the entire input only if it matches" <| - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.replace input "REPLACED" mode=Regex_Mode.Full - match . should_equal "REPLACED" - - Test.specify "should correctly replace entire input in Full mode even if partial matches are possible" <| - pattern = engine.compile "(aa)+" [] - pattern.replace "aaa" "REPLACED" mode=Regex_Mode.Full . should_equal "aaa" - pattern.replace "aaaa" "REPLACED" mode=Regex_Mode.Full . 
should_equal "REPLACED" - - Test.specify "should return the input for a full replace if the pattern doesn't match the entire input" <| - pattern = engine.compile "(..)" [] - input = "aa ab" - full_match = pattern.replace input "REPLACED" mode=Regex_Mode.Full - full_match . should_equal input - - Test.specify "should not perform overlapping replacements in counted mode" <| - pattern = engine.compile "(..)" [] - input = "abcdefghij" - result = pattern.replace input "REPLACED" mode=3 - result . should_equal "REPLACEDREPLACEDREPLACEDghij" - - Test.specify "should not perform overlapping replacements in all mode" <| - pattern = engine.compile "(..)" [] - input = "aa ab" - match = pattern.replace input "REPLACED" mode=Regex_Mode.All - match . should_equal "REPLACEDREPLACEDb" - - Test.specify "should handle capture groups in replacement" <| - pattern = engine.compile "(?<capture>[a-z]+)" [] - pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$1]" mode=0 . should_equal "foo bar, baz" - pattern.replace "foo bar, baz" "[$1]" mode=1 . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[$1]" mode=2 . should_equal "[foo] [bar], baz" - pattern.replace "foo bar, baz" "[$1]" mode=3 . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$1]" mode=4 . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.First . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]" - - pattern.replace "foo bar, baz" "[${capture}]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[${capture}]" mode=0 . should_equal "foo bar, baz" - pattern.replace "foo bar, baz" "[${capture}]" mode=1 . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[${capture}]" mode=2 .
should_equal "[foo] [bar], baz" - pattern.replace "foo bar, baz" "[${capture}]" mode=3 . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[${capture}]" mode=4 . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.First . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]" - - Test.specify "should handle capture groups in replacement in All mode" <| - pattern = engine.compile "([a-z]+)" [] - pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.Full . should_equal "foo bar, baz" - pattern.replace "foo" "[$1]" mode=Regex_Mode.Full . should_equal "[foo]" - - pattern_2 = engine.compile '(?.*?)' [] - pattern_2.replace 'content' "$2 <- $1" mode=Regex_Mode.Full . should_equal "content <- url" - pattern_2.replace 'content' "${name} <- ${addr}" mode=Regex_Mode.Full . should_equal "content <- url" - - Test.group "Match.group" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - - Test.specify "should be a Match" <| - match . should_be_a Default_Engine.Match.Value - - Test.specify "should return the full match with index 0" <| - match.group 0 . should_equal "aa ab abc a bc bcd" - - Test.specify "should return the group contents if it matches by index" <| - match.group 1 . should_equal "aa ab " - - Test.specify "should return the group contents if it matches by name" <| - match.group "letters" . should_equal "abc a bc bcd" - - Test.specify "should return Nothing if the group did not match" <| - match.group 3 . should_equal Nothing - - Test.specify "should fail with No_Such_Group_Error if the group did not exist" <| - match.group "fail" . should_fail_with No_Such_Group - match.group 5 . should_fail_with No_Such_Group - - Test.specify "should make named groups accessible by index" <| - match.group 2 . 
should_equal (match.group "letters") - - Test.group "Match.groups" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - - Test.specify "should be a Match" <| - match . should_be_a Default_Engine.Match.Value - - Test.specify "should return the results of all groups" <| - groups = match.groups - groups.length . should_equal 5 - groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing] - - Test.specify "should replace unmatched groups by a user-specified value" <| - groups = match.groups "UNMATCHED" - groups.length . should_equal 5 - groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"] - - Test.group "Match.named_groups" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - - Test.specify "should be a Match" <| - match . should_be_a Default_Engine.Match.Value - - Test.specify "should return the results of all named groups" <| - groups = match.named_groups - groups.size . should_equal 2 - groups.at "letters" . should_equal "abc a bc bcd" - groups.at "empty" . should_equal Nothing - - Test.specify "should replace unmatched groups by a user-specified value" <| - groups = match.named_groups "UNMATCHED" - groups.size . should_equal 2 - groups.at "letters" . should_equal "abc a bc bcd" - groups.at "empty" . should_equal "UNMATCHED" - - Test.group "Match.start" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - - Test.specify "should be a Match" <| - match . should_be_a Default_Engine.Match.Value - - Test.specify "should return the start of a group by index" <| - match.start 1 . 
should_equal 0 - - Test.specify "should return the start of a group by name" <| - match.start "letters" . should_equal 6 - - Test.specify "should return Nothing if the group didn't match" <| - match.start 3 . should_equal Nothing - match.start "empty" . should_equal Nothing - - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.start 5 . should_fail_with No_Such_Group - match.start "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.end" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - - Test.specify "should be a Match" <| - match . should_be_a Default_Engine.Match.Value - - Test.specify "should return the end of a group by index" <| - match.end 1 . should_equal 6 - - Test.specify "should return the end of a group by name" <| - match.end "letters" . should_equal 18 - - Test.specify "should return Nothing if the group didn't match" <| - match.end 3 . should_equal Nothing - match.end "empty" . should_equal Nothing - - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.end 5 . should_fail_with No_Such_Group - match.end "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.span" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - - Test.specify "should be a Match" <| - match . should_be_a Default_Engine.Match.Value - - Test.specify "should get the span of a group by index" <| - match.span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input) - - Test.specify "should get the span of a group by name" <| - match.span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input) - - Test.specify "should return Nothing if the group didn't match" <| - match.span 3 . should_equal Nothing - match.span "empty" . 
should_equal Nothing - - Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <| - match.span 5 . should_fail_with No_Such_Group - match.span "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.start_position" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - - Test.specify "should be a Match" <| - match . should_be_a Default_Engine.Match.Value - - Test.specify "should return the region start over which self match was performed" <| - match.start_position . should_equal 0 - - Test.group "Match.end_position" <| - engine = Default_Engine.new - pattern = engine.compile "(.. .. )(?.+)()??(?)??" [] - input = "aa ab abc a bc bcd" - match = pattern.match input mode=Matching_Mode.First - - Test.specify "should be a Match" <| - match . should_be_a Default_Engine.Match.Value - - Test.specify "should return the region end over which self match was performed" <| - match.end_position . should_equal 18 - -main = Test_Suite.run_main spec diff --git a/test/Tests/src/Data/Text/Matching_Spec.enso b/test/Tests/src/Data/Text/Matching_Spec.enso deleted file mode 100644 index 7402c27adc..0000000000 --- a/test/Tests/src/Data/Text/Matching_Spec.enso +++ /dev/null @@ -1,78 +0,0 @@ -from Standard.Base import all -import Standard.Base.Data.Text.Matching -import Standard.Base.Errors.Common.No_Such_Method - -from Standard.Test import Test, Test_Suite, Problems -import Standard.Test.Extensions - -type Foo_Error - -spec = Test.group 'Matching Helper' <| - Test.specify 'should match a single name with a single Text_Matcher criterion' <| - Text_Matcher.Case_Sensitive.match_single_criterion "foo" "foo" . should_be_true - Text_Matcher.Case_Sensitive.match_single_criterion "foobar" "foo" . should_be_false - Text_Matcher.Case_Sensitive.match_single_criterion "foo" "f.*" . 
should_be_false - Text_Matcher.Case_Sensitive.match_single_criterion "foo" "Foo" . should_be_false - - Test.specify 'should correctly handle Unicode folding with Text_Matcher matching' <| - Text_Matcher.Case_Sensitive.match_single_criterion '\u00E9' '\u0065\u{301}' . should_be_true - Text_Matcher.Case_Sensitive.match_single_criterion 'é' '\u00E9' . should_be_true - Text_Matcher.Case_Sensitive.match_single_criterion 'é' 'ę' . should_be_false - - Test.specify 'should match a single name with a single regex criterion' <| - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foo" "foo" . should_be_true - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foobar" "foo" . should_be_false - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foo" "f.*" . should_be_true - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foo" "foo.*" . should_be_true - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foo" "F.*" . should_be_false - - Test.specify 'should support case-insensitive matching' <| - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive).match_single_criterion "foo" "F.*" . should_be_true - Text_Matcher.Case_Insensitive.match_single_criterion "foO" "FOo" . should_be_true - - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive).match_single_criterion "foo" "fF.*" . should_be_false - Text_Matcher.Case_Insensitive.match_single_criterion "foo" "Foos" . should_be_false - - # Small beta is equal to capital 'beta' which looks the same as capital 'b' but is a different symbol. - Text_Matcher.Case_Insensitive.match_single_criterion "β" "Β" . should_be_true - Text_Matcher.Case_Insensitive.match_single_criterion "β" "B" . 
should_be_false - - Test.specify 'should match a list of names with a list of criteria, correctly handling reordering' <| - Text_Matcher.Case_Sensitive.match_criteria ["foo", "bar", "baz"] ["baz", "foo"] reorder=True . should_equal ["baz", "foo"] - Text_Matcher.Case_Sensitive.match_criteria ["foo", "bar", "baz"] ["baz", "foo"] reorder=False . should_equal ["foo", "baz"] - - Test.specify 'should allow multiple matches to a single criterion (Regex)' <| - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_criteria ["foo", "bar", "baz", "quux"] ["b.*"] reorder=True . should_equal ["bar", "baz"] - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_criteria ["foo", "bar", "baz", "quux"] ["b.*", "foo"] reorder=False . should_equal ["foo", "bar", "baz"] - - Test.specify 'should include the object only with the first criterion that matched it, avoiding duplication' <| - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_criteria ["foo", "bar", "baz", "zap"] [".*z.*", "b.*"] reorder=True . should_equal ["baz", "zap", "bar"] - (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_criteria ["foo", "bar", "baz", "zap"] [".*z.*", "b.*"] reorder=False . 
should_equal ["bar", "baz", "zap"] - - Test.specify 'should correctly handle criteria which did not match anything' <| - action = Text_Matcher.Case_Sensitive.match_criteria ["foo", "bar", "baz"] ["baz", "unknown_column"] reorder=True on_problems=_ - tester = _.should_equal ["baz"] - problems = [Matching.No_Matches_Found.Error ["unknown_column"]] - Problems.test_problem_handling action problems tester - - action_2 = Text_Matcher.Case_Sensitive.match_criteria ["foo", "bar", "baz"] ["baz", "unknown_column_1", "unknown_column_2"] reorder=False on_problems=_ - problems_2 = [Matching.No_Matches_Found.Error ["unknown_column_1", "unknown_column_2"]] - Problems.test_problem_handling action_2 problems_2 tester - - Test.specify 'should correctly work with complex object using a function extracting their names' <| - pairs = [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10] - selected = [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0] - Text_Matcher.Case_Sensitive.match_criteria pairs ["bar", "foo"] reorder=True name_mapper=_.first . should_equal selected - - Text_Matcher.Case_Sensitive.match_criteria [1, 2, 3] ["2"] name_mapper=_.to_text . should_equal [2] - - Test.specify 'should correctly forward errors' <| - Text_Matcher.Case_Sensitive.match_criteria (Error.throw Foo_Error) [] . should_fail_with Foo_Error - Text_Matcher.Case_Sensitive.match_criteria [] (Error.throw Foo_Error) . should_fail_with Foo_Error - (Error.throw Foo_Error).match_criteria [] [] . should_fail_with Foo_Error - Text_Matcher.Case_Sensitive.match_criteria [1, 2, 3] ["2"] name_mapper=(x-> if x == 3 then Error.throw Foo_Error else x.to_text) . 
should_fail_with Foo_Error - - Test.expect_panic_with matcher=No_Such_Method <| - Text_Matcher.Case_Sensitive.match_criteria ["a"] ["a"] name_mapper=_.nonexistent_function - -main = Test_Suite.run_main spec diff --git a/test/Tests/src/Data/Text/Regex_2_Spec.enso b/test/Tests/src/Data/Text/Regex_2_Spec.enso deleted file mode 100644 index 0f2401a889..0000000000 --- a/test/Tests/src/Data/Text/Regex_2_Spec.enso +++ /dev/null @@ -1,487 +0,0 @@ -from Standard.Base import all -import Standard.Base.Data.Text.Span.Span -import Standard.Base.Data.Text.Span.Utf_16_Span -import Standard.Base.Data.Text.Regex.Match_2.Match_2 -import Standard.Base.Data.Text.Regex.Pattern_2.Pattern_2 -import Standard.Base.Data.Text.Regex.Replacer.Replacer -import Standard.Base.Data.Text.Regex_2 -import Standard.Base.Data.Text.Regex_2.No_Such_Group -import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error -import Standard.Base.Errors.Illegal_Argument.Illegal_Argument - -from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup - -from Standard.Test import Test, Test_Suite -import Standard.Test.Extensions - -polyglot java import org.enso.base.Replacer_Cache - -spec = - Test.group "Compile" <| - Test.specify "should be able to be compiled" <| - pattern = Regex_2.compile "(?..)" case_insensitive=True - pattern . should_be_a Pattern_2 - - Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <| - Regex_2.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error - - Test.specify "should disallow empty patterns in `compile`" <| - Regex_2.compile "" . should_fail_with Illegal_Argument - - Test.group "Escape" <| - Test.specify "should escape an expression for use as a literal" <| - pattern = "http://example.com" - Regex_2.escape pattern . should_equal "\Qhttp://example.com\E" - - Test.group "Pattern.matches" <| - Test.specify "should return True when the pattern matches against the input" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" 
- input = "aa ab abc a bc bcd" - pattern.matches input . should_be_true - - Test.specify "should return False when the pattern doesn't match against the input" <| - pattern = Regex_2.compile "aaz" - input = "aa ab abc a bc bcd" - pattern.matches input . should_be_false - - Test.specify "should check for full matches" <| - pattern = Regex_2.compile "f.o" - pattern.matches "foo" . should_be_true - pattern.matches "foobar" . should_be_false - - Test.specify "`matches` with an empty pattern should be an error" <| - pattern = Regex_2.compile "" - pattern.matches "ABC" . should_fail_with Illegal_Argument - - Test.group "Pattern.match" <| - Test.specify "should be able to `match` the first instance of the pattern in the input" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - match.text 0 . should_equal input - - Test.specify "should return `Nothing` if there are no matches in first mode" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "abc" - match = pattern.match input - match . should_equal Nothing - - Test.specify "should be able to `match` the all instances of the pattern in the input" <| - pattern = Regex_2.compile "(..)" - input = "abcdefghij" - matches = pattern.match_all input - matches.length . should_equal 5 - matches.at 0 . text 0 . should_equal "ab" - matches.at 1 . text 0 . should_equal "cd" - matches.at 2 . text 0 . should_equal "ef" - matches.at 3 . text 0 . should_equal "gh" - matches.at 4 . text 0 . should_equal "ij" - - Test.specify "should return `[]` when an all match match fails" <| - pattern = Regex_2.compile "(aa)" - input = "abcdefghij" - match = pattern.match_all input - match . should_equal [] - - Test.specify "`match` with an empty pattern should be an error" <| - pattern = Regex_2.compile "" - pattern.match "ABC" . 
should_fail_with Illegal_Argument - - Test.specify "`match_all` with an empty pattern should be an error" <| - pattern = Regex_2.compile "" - pattern.match_all "ABC" . should_fail_with Illegal_Argument - - Test.group "Pattern_2.find and .find_all" <| - Test.specify "should be able to `find` the first instance of the pattern in the input" <| - pattern = Regex_2.compile "(..)" - input = "abcdefghij" - match = pattern.find input - match . should_be_a Text - match . should_equal "ab" - - Test.specify "should return `Nothing` if there are no matches in first mode" <| - pattern = Regex_2.compile "(aa)" - input = "abcdefghij" - match = pattern.find input - match . should_equal Nothing - - Test.specify "should be able to `find` the all instances of the pattern in the input" <| - pattern = Regex_2.compile "(..)" - input = "abcdefghij" - match = pattern.find_all input - match.length . should_equal 5 - match.at 0 . should_equal "ab" - match.at 1 . should_equal "cd" - match.at 2 . should_equal "ef" - match.at 3 . should_equal "gh" - match.at 4 . should_equal "ij" - - Test.specify "should return `[]` when an all match match fails" <| - pattern = Regex_2.compile "(aa)" - input = "abcdefghij" - match = pattern.find_all input - match . should_equal [] - - Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <| - Regex_2.compile "(a+|1+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"] - Regex_2.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"] - Regex_2.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"] - - Test.specify "`find` with an empty pattern should be an error" <| - pattern = Regex_2.compile "" - pattern.find "ABC" . should_fail_with Illegal_Argument - - Test.specify "`find_all` with an empty pattern should be an error" <| - pattern = Regex_2.compile "" - pattern.find_all "ABC" . 
should_fail_with Illegal_Argument - - Test.group "Pattern_2.split" <| - Test.specify "should be able to `split` on the first instance of the pattern" <| - pattern = Regex_2.compile "cd" - input = "abcdefcdghij" - texts = pattern.split input only_first=True - texts . should_equal ["ab", "efcdghij"] - - Test.specify "should return the original text if there are no matches in first mode" <| - pattern = Regex_2.compile "aa" - input = "abcdefghij" - texts = pattern.split input only_first=True - texts . should_equal ["abcdefghij"] - - Test.specify "should return the original text if there are no matches in all mode" <| - pattern = Regex_2.compile "aa" - input = "abcdefghij" - texts = pattern.split input - texts . should_equal ["abcdefghij"] - - Test.specify "should be able to `split` on the all instances of the pattern in the input" <| - pattern = Regex_2.compile "a" - pattern.split "bacadaeaf" . should_equal ["b", "c", "d", "e", "f"] - pattern.split "baab" . should_equal ["b", "", "b"] - pattern.split "aaa" . should_equal ["", "", "", ""] - pattern.split "" . should_equal [""] - pattern.split "a" . should_equal ["", ""] - pattern.split "abaca" . should_equal ["", "b", "c", ""] - - Test.specify "should split without normalization" <| - pattern = Regex_2.compile "s" - pattern.split 'aśsśs\u{301}śb' . should_equal ['aś', 'ś', '\u{301}śb'] - - Test.group "Pattern_2.tokenize" <| - Test.specify "can tokenize with simple regexes without capturing groups" - Regex_2.compile "[a-z]+" . tokenize "1-800-regex-yes" . should_equal ["regex", "yes"] - Regex_2.compile "[a-z]+" case_insensitive=True . tokenize "1-800-REGEX-YES" . should_equal ["REGEX", "YES"] - Regex_2.compile "\d\d" . tokenize "12 hi345 67r890r" . should_equal ["12", "34", "67", "89"] - - Test.specify "can tokenize with regexes with capturing groups" - Regex_2.compile "(\d\d)\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] - Regex_2.compile "[a-z]+(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . 
should_equal ["182", "20"] - Regex_2.compile "[a-z]+(\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["", "182", "20", ""] - - Test.specify "ignores non-capturing groups" - Regex_2.compile "(?:(\d\d)\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] - Regex_2.compile "(\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] - Regex_2.compile "(?\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] - Regex_2.compile "(?:[a-z]+)(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"] - - Test.specify "ignores nested groups" - Regex_2.compile "(\d(\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] - Regex_2.compile "(?\d(?\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] - Regex_2.compile "[a-z]+((\d)\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"] - Regex_2.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"] - - Test.specify "non-participating groups are rendered as the empty string" - Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_4_0" . should_equal ['340'] - Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_q_0" . should_equal ['3q0'] - - Test.specify "handles unicode" <| - Regex_2.compile "[áê]+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"] - # `+` only applies to the accent `\u{301}`, not to the entire grapheme. - Regex_2.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}'] - Regex_2.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}a\u{301}'] - Regex_2.compile "x([áê]+)y" . tokenize "xáy xêy" . should_equal ["á", "ê"] - - Test.specify "examples are correct" <| - Regex_2.compile "..." . tokenize "ABCDEF" . should_equal ["ABC","DEF"] - Regex_2.compile "(.).(.)" . tokenize "ABCDEF" . should_equal ["AC","DF"] - Regex_2.compile "(\S+)(?:\s+|$)" . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' 
. should_equal ["Hello","Big","Wide","World","Goodbye!"] - - Test.group "Pattern_2.replace" <| - Test.specify "should be able to `replace` the first instance of the pattern in the input" <| - pattern = Regex_2.compile "abc" - input = "aa ab abc a bc abc" - match = pattern.replace input "REPLACED" only_first=True - match . should_be_a Text - match . should_equal "aa ab REPLACED a bc abc" - - Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <| - pattern = Regex_2.compile "xyz" - input = "aa ab ac ad" - match = pattern.replace input "REPLACED" only_first=True - match . should_equal input - - Test.specify "should be able to replace the all instances of the pattern in the input" <| - pattern = Regex_2.compile "aa" - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "REPLACED" - match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax" - - Test.specify "should return the input when an all replace fails" <| - pattern = Regex_2.compile "aa" - input = "abcdefghij" - match = pattern.replace input "REPLACED" - match . should_equal input - - Test.specify "should be able to replace the entire input only if it matches" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.replace input "REPLACED" - match . should_equal "REPLACED" - - Test.specify "should not perform overlapping replacements in all mode" <| - pattern = Regex_2.compile "(..)" - input = "aa ab" - match = pattern.replace input "REPLACED" - match . should_equal "REPLACEDREPLACEDb" - - Test.specify "should handle capture groups in replacement" <| - pattern = Regex_2.compile "(?[a-z]+)" - pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$1]" only_first=True . should_equal "[foo] bar, baz" - - pattern.replace "foo bar, baz" "[$]" . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$]" only_first=True . 
should_equal "[foo] bar, baz" - - pattern.replace "foo bar, baz" "[$0]" . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$0]" only_first=True . should_equal "[foo] bar, baz" - pattern.replace "foo bar, baz" "[$&]" . should_equal "[foo] [bar], [baz]" - pattern.replace "foo bar, baz" "[$&]" only_first=True . should_equal "[foo] bar, baz" - - Test.specify "should handle unicode in capture group names" <| - pattern = Regex_2.compile "(?<건반>[a-z]+)" - pattern.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]" - - Text.group "should correctly evaluate documentation examples" <| - Test.specify "example 1" <| - pattern = Regex_2.compile 'aa' - pattern.replace 'aaa' 'b' . should_equal 'ba' - - Test.specify "example 2" <| - pattern = Regex_2.compile '[lo]' - pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!' - - Test.specify "example 3" <| - pattern = Regex_2.compile 'l' - pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!' - - Test.specify "example 4" <| - pattern = Regex_2.compile '"(.*?)"' - pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz' - - Test.specify "example 5" <| - pattern = Regex_2.compile "aa" - input = "aa ab aa ac ad aa aa ax" - match = pattern.replace input "xyz" - match . should_equal "xyz ab xyz ac ad xyz xyz ax" - - Test.specify "example 6" <| - pattern = Regex_2.compile "([a-z]+)" - pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]" - - Test.specify "`replace` with an empty pattern should be an error" <| - pattern = Regex_2.compile "" - pattern.replace "ABC" . should_fail_with Illegal_Argument - - Test.group "Match.text" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - - Test.specify "should return the full match with index 0" <| - match.text 0 . 
should_equal "aa ab abc a bc bcd" - - Test.specify "should return the group contents if it matches by index" <| - match.text 1 . should_equal "aa ab " - - Test.specify "should return the group contents if it matches by name" <| - match.text "letters" . should_equal "abc a bc bcd" - - Test.specify "should return Nothing if the group did not match" <| - match.text 3 . should_equal Nothing - - Test.specify "should fail with No_Such_Group_Error if the group did not exist" <| - match.text "fail" . should_fail_with No_Such_Group - match.text 5 . should_fail_with No_Such_Group - - Test.specify "should make named groups accessible by index" <| - match.text 2 . should_equal (match.text "letters") - - Test.group "Match.groups" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - - Test.specify "should return the results of all groups" <| - groups = match.groups - groups.length . should_equal 5 - groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing] - - Test.specify "should replace unmatched groups by a user-specified value" <| - groups = match.groups "UNMATCHED" - groups.length . should_equal 5 - groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"] - - Test.group "Match.named_groups" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2.Value - - Test.specify "should return the results of all named groups" <| - groups = match.named_groups - groups.keys.sort . should_equal ["empty", "letters"] - groups.size . should_equal 2 - groups.at "letters" . should_equal "abc a bc bcd" - groups.at "empty" . should_equal Nothing - - Test.specify "should replace unmatched groups by a user-specified value" <| - groups = match.named_groups "UNMATCHED" - groups.size . should_equal 2 - groups.at "letters" . 
should_equal "abc a bc bcd" - groups.at "empty" . should_equal "UNMATCHED" - - Test.group "Match.start" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - - Test.specify "should return the start of a group by index" <| - match.start 1 . should_equal 0 - - Test.specify "should return the start of a group by name" <| - match.start "letters" . should_equal 6 - - Test.specify "should return Nothing if the group didn't match" <| - match.start 3 . should_equal Nothing - match.start "empty" . should_equal Nothing - - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.start 5 . should_fail_with No_Such_Group - match.start "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.end" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - - Test.specify "should return the end of a group by index" <| - match.end 1 . should_equal 6 - - Test.specify "should return the end of a group by name" <| - match.end "letters" . should_equal 18 - - Test.specify "should return Nothing if the group didn't match" <| - match.end 3 . should_equal Nothing - match.end "empty" . should_equal Nothing - - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.end 5 . should_fail_with No_Such_Group - match.end "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.utf_16_start" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - - Test.specify "should return the start of a group by index" <| - match.utf_16_start 1 . should_equal 0 - - Test.specify "should return the start of a group by name" <| - match.utf_16_start "letters" . 
should_equal 6 - - Test.specify "should return Nothing if the group didn't match" <| - match.utf_16_start 3 . should_equal Nothing - match.utf_16_start "empty" . should_equal Nothing - - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.utf_16_start 5 . should_fail_with No_Such_Group - match.utf_16_start "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.utf_16_end" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - - Test.specify "should return the end of a group by index" <| - match.utf_16_end 1 . should_equal 6 - - Test.specify "should return the end of a group by name" <| - match.utf_16_end "letters" . should_equal 18 - - Test.specify "should return Nothing if the group didn't match" <| - match.utf_16_end 3 . should_equal Nothing - match.utf_16_end "empty" . should_equal Nothing - - Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| - match.utf_16_end 5 . should_fail_with No_Such_Group - match.utf_16_end "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.span" <| - pattern = Regex_2.compile "(.. .. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - - Test.specify "should get the span of a group by index" <| - match.span 1 . should_equal (Span.Value (0.up_to 6) input) - - Test.specify "should get the span of a group by name" <| - match.span "letters" . should_equal (Span.Value (6.up_to 18) input) - - Test.specify "should return Nothing if the group didn't match" <| - match.span 3 . should_equal Nothing - match.span "empty" . should_equal Nothing - - Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <| - match.span 5 . should_fail_with No_Such_Group - match.span "nonexistent" . should_fail_with No_Such_Group - - Test.group "Match.utf_16_span" <| - pattern = Regex_2.compile "(.. 
.. )(?.+)()??(?)??" - input = "aa ab abc a bc bcd" - match = pattern.match input - match . should_be_a Match_2 - - Test.specify "should get the UTF16 span of a group by index" <| - match.utf_16_span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input) - - Test.specify "should get the UTF16 span of a group by name" <| - match.utf_16_span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input) - - Test.specify "should return Nothing if the group didn't match" <| - match.utf_16_span 3 . should_equal Nothing - match.utf_16_span "empty" . should_equal Nothing - - Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <| - match.utf_16_span 5 . should_fail_with No_Such_Group - match.utf_16_span "nonexistent" . should_fail_with No_Such_Group - - Test.group "caching" <| - Test.specify "Replacer cache drops old values" <| - pattern = Regex_2.compile('([a-c])') - - # Add enough values to flush out the first values. - 0.up_to get_lru_size+1 . map i-> - result = pattern.replace "abcdef" ("$1$1x" + i.to_text) - result . should_not_equal Nothing - replacer_cache_lookup "$1$1x0" . should_equal Nothing - replacer_cache_lookup "$1$1x1" . 
should_not_equal Nothing - -main = Test_Suite.run_main spec diff --git a/test/Tests/src/Data/Text/Regex_Spec.enso b/test/Tests/src/Data/Text/Regex_Spec.enso index 9df40c71cb..99449f207e 100644 --- a/test/Tests/src/Data/Text/Regex_Spec.enso +++ b/test/Tests/src/Data/Text/Regex_Spec.enso @@ -1,30 +1,507 @@ from Standard.Base import all +import Standard.Base.Data.Text.Span.Span +import Standard.Base.Data.Text.Span.Utf_16_Span +import Standard.Base.Data.Text.Regex +import Standard.Base.Data.Text.Regex.Match.Match +import Standard.Base.Data.Text.Regex.No_Such_Group +import Standard.Base.Data.Text.Regex.Pattern.Pattern +import Standard.Base.Data.Text.Regex.Regex_Syntax_Error +import Standard.Base.Data.Text.Regex.Replacer.Replacer +import Standard.Base.Errors.Common.Type_Error +import Standard.Base.Errors.Illegal_Argument.Illegal_Argument -import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine +from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup from Standard.Test import Test, Test_Suite import Standard.Test.Extensions +polyglot java import org.enso.base.Replacer_Cache + spec = - Test.group "Regex options handling" <| - Test.specify "should work properly with flag options" <| - flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[] - flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments] - - Test.specify "should properly override vector options" <| - flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[Regex_Option.Multiline, Regex_Option.Case_Insensitive] - flags . 
should_equal [Regex_Option.Ascii_Matching, Regex_Option.Case_Insensitive, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments] - - Test.group "Regexes" <| + Test.group "Compile" <| Test.specify "should be able to be compiled" <| pattern = Regex.compile "(?..)" case_insensitive=True - pattern . should_be_a Default_Engine.Pattern.Value - pattern.options . should_equal [Regex_Option.Case_Insensitive] + pattern . should_be_a Pattern - Test.specify "should be able to be escaped" <| - pattern = "http://example.com" - Regex.escape pattern . should_equal "\Qhttp://example.com\E" + Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <| + Regex.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error - ## TODO: Missing tests for No_Such_Group_Error + Test.specify "should disallow empty patterns in `compile`" <| + Regex.compile "" . should_fail_with Illegal_Argument + + Test.group "Escape" <| + Test.specify "should escape an expression for use as a literal" <| + Regex.escape "[a-z\d]+" . should_equal '\\[a-z\\d\\]\\+' + + Test.group "Pattern.matches" <| + Test.specify "should return True when the pattern matches against the input" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + pattern.matches input . should_be_true + + Test.specify "should return False when the pattern doesn't match against the input" <| + pattern = Regex.compile "aaz" + input = "aa ab abc a bc bcd" + pattern.matches input . should_be_false + + Test.specify "should check for full matches" <| + pattern = Regex.compile "f.o" + pattern.matches "foo" . should_be_true + pattern.matches "foobar" . should_be_false + + Test.specify "`matches` with an empty pattern should be an error" <| + pattern = Regex.compile "" + pattern.matches "ABC" . should_fail_with Illegal_Argument + + Test.specify "`matches` against a non-Text should fail with Illegal_Argument" <| + pattern = Regex.compile "abc" + pattern.matches 1 . 
should_fail_with Type_Error + + Test.group "Pattern.match and .match_all" <| + Test.specify "should be able to `match` the first instance of the pattern in the input" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match + match.text 0 . should_equal input + + Test.specify "should return `Nothing` if there are no matches in first mode" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "abc" + match = pattern.match input + match . should_equal Nothing + + Test.specify "should be able to `match` the all instances of the pattern in the input" <| + pattern = Regex.compile "(..)" + input = "abcdefghij" + matches = pattern.match_all input + matches.length . should_equal 5 + matches.at 0 . text 0 . should_equal "ab" + matches.at 1 . text 0 . should_equal "cd" + matches.at 2 . text 0 . should_equal "ef" + matches.at 3 . text 0 . should_equal "gh" + matches.at 4 . text 0 . should_equal "ij" + + Test.specify "should return `[]` when an all match match fails" <| + pattern = Regex.compile "(aa)" + input = "abcdefghij" + match = pattern.match_all input + match . should_equal [] + + Test.specify "`match` with an empty pattern should be an error" <| + pattern = Regex.compile "" + pattern.match "ABC" . should_fail_with Illegal_Argument + + Test.specify "`match_all` with an empty pattern should be an error" <| + pattern = Regex.compile "" + pattern.match_all "ABC" . should_fail_with Illegal_Argument + + Test.specify "`match` against a non-Text should fail with Illegal_Argument" <| + pattern = Regex.compile "abc" + pattern.match 1 . should_fail_with Type_Error + + Test.specify "`match_all` against a non-Text should fail with Illegal_Argument" <| + pattern = Regex.compile "abc" + pattern.match_all 1 . 
should_fail_with Type_Error + + Test.group "Pattern.find and .find_all" <| + Test.specify "should be able to `find` the first instance of the pattern in the input" <| + pattern = Regex.compile "(..)" + input = "abcdefghij" + match = pattern.find input + match . should_be_a Text + match . should_equal "ab" + + Test.specify "should return `Nothing` if there are no matches in first mode" <| + pattern = Regex.compile "(aa)" + input = "abcdefghij" + match = pattern.find input + match . should_equal Nothing + + Test.specify "should be able to `find` the all instances of the pattern in the input" <| + pattern = Regex.compile "(..)" + input = "abcdefghij" + match = pattern.find_all input + match.length . should_equal 5 + match.at 0 . should_equal "ab" + match.at 1 . should_equal "cd" + match.at 2 . should_equal "ef" + match.at 3 . should_equal "gh" + match.at 4 . should_equal "ij" + + Test.specify "should return `[]` when an all match match fails" <| + pattern = Regex.compile "(aa)" + input = "abcdefghij" + match = pattern.find_all input + match . should_equal [] + + Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <| + Regex.compile "(a+|1+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"] + Regex.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"] + Regex.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"] + + Test.specify "`find` with an empty pattern should be an error" <| + pattern = Regex.compile "" + pattern.find "ABC" . should_fail_with Illegal_Argument + + Test.specify "`find_all` with an empty pattern should be an error" <| + pattern = Regex.compile "" + pattern.find_all "ABC" . should_fail_with Illegal_Argument + + Test.group "Pattern.split" <| + Test.specify "should be able to `split` on the first instance of the pattern" <| + pattern = Regex.compile "cd" + input = "abcdefcdghij" + texts = pattern.split input only_first=True + texts . 
should_equal ["ab", "efcdghij"] + + Test.specify "should return the original text if there are no matches in first mode" <| + pattern = Regex.compile "aa" + input = "abcdefghij" + texts = pattern.split input only_first=True + texts . should_equal ["abcdefghij"] + + Test.specify "should return the original text if there are no matches in all mode" <| + pattern = Regex.compile "aa" + input = "abcdefghij" + texts = pattern.split input + texts . should_equal ["abcdefghij"] + + Test.specify "should be able to `split` on the all instances of the pattern in the input" <| + pattern = Regex.compile "a" + pattern.split "bacadaeaf" . should_equal ["b", "c", "d", "e", "f"] + pattern.split "baab" . should_equal ["b", "", "b"] + pattern.split "aaa" . should_equal ["", "", "", ""] + pattern.split "" . should_equal [""] + pattern.split "a" . should_equal ["", ""] + pattern.split "abaca" . should_equal ["", "b", "c", ""] + + Test.specify "should split without normalization" <| + pattern = Regex.compile "s" + pattern.split 'aśsśs\u{301}śb' . should_equal ['aś', 'ś', '\u{301}śb'] + + Test.specify "`split` against a non-Text should fail with Illegal_Argument" <| + pattern = Regex.compile "abc" + pattern.split 1 . should_fail_with Type_Error + + Test.group "Pattern.tokenize" <| + Test.specify "can tokenize with simple regexes without capturing groups" + Regex.compile "[a-z]+" . tokenize "1-800-regex-yes" . should_equal ["regex", "yes"] + Regex.compile "[a-z]+" case_insensitive=True . tokenize "1-800-REGEX-YES" . should_equal ["REGEX", "YES"] + Regex.compile "\d\d" . tokenize "12 hi345 67r890r" . should_equal ["12", "34", "67", "89"] + + Test.specify "can tokenize with regexes with capturing groups" + Regex.compile "(\d\d)\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] + Regex.compile "[a-z]+(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"] + Regex.compile "[a-z]+(\d*)" . tokenize "xy blink182 !!matchbox20 foo" . 
should_equal ["", "182", "20", ""] + + Test.specify "ignores non-capturing groups" + Regex.compile "(?:(\d\d)\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] + Regex.compile "(\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] + Regex.compile "(?\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] + Regex.compile "(?:[a-z]+)(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"] + + Test.specify "ignores nested groups" + Regex.compile "(\d(\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] + Regex.compile "(?\d(?\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"] + Regex.compile "[a-z]+((\d)\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"] + Regex.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"] + + Test.specify "non-participating groups are rendered as the empty string" + Regex.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_4_0" . should_equal ['340'] + Regex.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_q_0" . should_equal ['3q0'] + + Test.specify "handles unicode" <| + Regex.compile "[áê]+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"] + # `+` only applies to the accent `\u{301}`, not to the entire grapheme. + Regex.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}'] + Regex.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}a\u{301}'] + Regex.compile "x([áê]+)y" . tokenize "xáy xêy" . should_equal ["á", "ê"] + + Test.specify "examples are correct" <| + Regex.compile "..." . tokenize "ABCDEF" . should_equal ["ABC","DEF"] + Regex.compile "(.).(.)" . tokenize "ABCDEF" . should_equal ["AC","DF"] + Regex.compile "(\S+)(?:\s+|$)" . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' . 
should_equal ["Hello","Big","Wide","World","Goodbye!"] + + Test.group "Pattern.replace" <| + Test.specify "should be able to `replace` the first instance of the pattern in the input" <| + pattern = Regex.compile "abc" + input = "aa ab abc a bc abc" + match = pattern.replace input "REPLACED" only_first=True + match . should_be_a Text + match . should_equal "aa ab REPLACED a bc abc" + + Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <| + pattern = Regex.compile "xyz" + input = "aa ab ac ad" + match = pattern.replace input "REPLACED" only_first=True + match . should_equal input + + Test.specify "should be able to replace the all instances of the pattern in the input" <| + pattern = Regex.compile "aa" + input = "aa ab aa ac ad aa aa ax" + match = pattern.replace input "REPLACED" + match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax" + + Test.specify "should return the input when an all replace fails" <| + pattern = Regex.compile "aa" + input = "abcdefghij" + match = pattern.replace input "REPLACED" + match . should_equal input + + Test.specify "should be able to replace the entire input only if it matches" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.replace input "REPLACED" + match . should_equal "REPLACED" + + Test.specify "should not perform overlapping replacements in all mode" <| + pattern = Regex.compile "(..)" + input = "aa ab" + match = pattern.replace input "REPLACED" + match . should_equal "REPLACEDREPLACEDb" + + Test.specify "should handle capture groups in replacement" <| + pattern = Regex.compile "(?[a-z]+)" + pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]" + pattern.replace "foo bar, baz" "[$1]" only_first=True . should_equal "[foo] bar, baz" + + pattern.replace "foo bar, baz" "[$]" . should_equal "[foo] [bar], [baz]" + pattern.replace "foo bar, baz" "[$]" only_first=True . 
should_equal "[foo] bar, baz" + + pattern.replace "foo bar, baz" "[$0]" . should_equal "[foo] [bar], [baz]" + pattern.replace "foo bar, baz" "[$0]" only_first=True . should_equal "[foo] bar, baz" + pattern.replace "foo bar, baz" "[$&]" . should_equal "[foo] [bar], [baz]" + pattern.replace "foo bar, baz" "[$&]" only_first=True . should_equal "[foo] bar, baz" + + Test.specify "should handle unicode in capture group names" <| + pattern = Regex.compile "(?<건반>[a-z]+)" + pattern.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]" + + Text.group "should correctly evaluate documentation examples" <| + Test.specify "example 1" <| + pattern = Regex.compile 'aa' + pattern.replace 'aaa' 'b' . should_equal 'ba' + + Test.specify "example 2" <| + pattern = Regex.compile '[lo]' + pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!' + + Test.specify "example 3" <| + pattern = Regex.compile 'l' + pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!' + + Test.specify "example 4" <| + pattern = Regex.compile '"(.*?)"' + pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz' + + Test.specify "example 5" <| + pattern = Regex.compile "aa" + input = "aa ab aa ac ad aa aa ax" + match = pattern.replace input "xyz" + match . should_equal "xyz ab xyz ac ad xyz xyz ax" + + Test.specify "example 6" <| + pattern = Regex.compile "([a-z]+)" + pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]" + + Test.specify "`replace` with an empty pattern should be an error" <| + pattern = Regex.compile "" + pattern.replace "ABC" . should_fail_with Illegal_Argument + + Test.specify "`replace` against a non-Text should fail with Illegal_Argument" <| + pattern = Regex.compile "abc" + pattern.replace 1 "abc" . should_fail_with Type_Error + + Test.group "Match.text" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . 
should_be_a Match + + Test.specify "should return the full match with index 0" <| + match.text 0 . should_equal "aa ab abc a bc bcd" + + Test.specify "should return the group contents if it matches by index" <| + match.text 1 . should_equal "aa ab " + + Test.specify "should return the group contents if it matches by name" <| + match.text "letters" . should_equal "abc a bc bcd" + + Test.specify "should return Nothing if the group did not match" <| + match.text 3 . should_equal Nothing + + Test.specify "should fail with No_Such_Group_Error if the group did not exist" <| + match.text "fail" . should_fail_with No_Such_Group + match.text 5 . should_fail_with No_Such_Group + + Test.specify "should make named groups accessible by index" <| + match.text 2 . should_equal (match.text "letters") + + Test.group "Match.groups" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match + + Test.specify "should return the results of all groups" <| + groups = match.groups + groups.length . should_equal 5 + groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing] + + Test.specify "should replace unmatched groups by a user-specified value" <| + groups = match.groups "UNMATCHED" + groups.length . should_equal 5 + groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"] + + Test.group "Match.named_groups" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match.Value + + Test.specify "should return the results of all named groups" <| + groups = match.named_groups + groups.keys.sort . should_equal ["empty", "letters"] + groups.size . should_equal 2 + groups.at "letters" . should_equal "abc a bc bcd" + groups.at "empty" . 
should_equal Nothing + + Test.specify "should replace unmatched groups by a user-specified value" <| + groups = match.named_groups "UNMATCHED" + groups.size . should_equal 2 + groups.at "letters" . should_equal "abc a bc bcd" + groups.at "empty" . should_equal "UNMATCHED" + + Test.group "Match.start" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match + + Test.specify "should return the start of a group by index" <| + match.start 1 . should_equal 0 + + Test.specify "should return the start of a group by name" <| + match.start "letters" . should_equal 6 + + Test.specify "should return Nothing if the group didn't match" <| + match.start 3 . should_equal Nothing + match.start "empty" . should_equal Nothing + + Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| + match.start 5 . should_fail_with No_Such_Group + match.start "nonexistent" . should_fail_with No_Such_Group + + Test.group "Match.end" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match + + Test.specify "should return the end of a group by index" <| + match.end 1 . should_equal 6 + + Test.specify "should return the end of a group by name" <| + match.end "letters" . should_equal 18 + + Test.specify "should return Nothing if the group didn't match" <| + match.end 3 . should_equal Nothing + match.end "empty" . should_equal Nothing + + Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| + match.end 5 . should_fail_with No_Such_Group + match.end "nonexistent" . should_fail_with No_Such_Group + + Test.group "Match.utf_16_start" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match + + Test.specify "should return the start of a group by index" <| + match.utf_16_start 1 . 
should_equal 0 + + Test.specify "should return the start of a group by name" <| + match.utf_16_start "letters" . should_equal 6 + + Test.specify "should return Nothing if the group didn't match" <| + match.utf_16_start 3 . should_equal Nothing + match.utf_16_start "empty" . should_equal Nothing + + Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| + match.utf_16_start 5 . should_fail_with No_Such_Group + match.utf_16_start "nonexistent" . should_fail_with No_Such_Group + + Test.group "Match.utf_16_end" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match + + Test.specify "should return the end of a group by index" <| + match.utf_16_end 1 . should_equal 6 + + Test.specify "should return the end of a group by name" <| + match.utf_16_end "letters" . should_equal 18 + + Test.specify "should return Nothing if the group didn't match" <| + match.utf_16_end 3 . should_equal Nothing + match.utf_16_end "empty" . should_equal Nothing + + Test.specify "should return No_Such_Group_Error if the group doesn't exist" <| + match.utf_16_end 5 . should_fail_with No_Such_Group + match.utf_16_end "nonexistent" . should_fail_with No_Such_Group + + Test.group "Match.span" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match + + Test.specify "should get the span of a group by index" <| + match.span 1 . should_equal (Span.Value (0.up_to 6) input) + + Test.specify "should get the span of a group by name" <| + match.span "letters" . should_equal (Span.Value (6.up_to 18) input) + + Test.specify "should return Nothing if the group didn't match" <| + match.span 3 . should_equal Nothing + match.span "empty" . should_equal Nothing + + Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <| + match.span 5 . 
should_fail_with No_Such_Group + match.span "nonexistent" . should_fail_with No_Such_Group + + Test.group "Match.utf_16_span" <| + pattern = Regex.compile "(.. .. )(?.+)()??(?)??" + input = "aa ab abc a bc bcd" + match = pattern.match input + match . should_be_a Match + + Test.specify "should get the UTF16 span of a group by index" <| + match.utf_16_span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input) + + Test.specify "should get the UTF16 span of a group by name" <| + match.utf_16_span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input) + + Test.specify "should return Nothing if the group didn't match" <| + match.utf_16_span 3 . should_equal Nothing + match.utf_16_span "empty" . should_equal Nothing + + Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <| + match.utf_16_span 5 . should_fail_with No_Such_Group + match.utf_16_span "nonexistent" . should_fail_with No_Such_Group + + Test.group "caching" <| + Test.specify "Replacer cache drops old values" <| + pattern = Regex.compile('([a-c])') + + # Add enough values to flush out the first values. + 0.up_to get_lru_size+1 . map i-> + result = pattern.replace "abcdef" ("$1$1x" + i.to_text) + result . should_not_equal Nothing + replacer_cache_lookup "$1$1x0" . should_equal Nothing + replacer_cache_lookup "$1$1x1" . 
should_not_equal Nothing main = Test_Suite.run_main spec diff --git a/test/Tests/src/Data/Text_Spec.enso b/test/Tests/src/Data/Text_Spec.enso index 376dd791c5..d0d20f136c 100644 --- a/test/Tests/src/Data/Text_Spec.enso +++ b/test/Tests/src/Data/Text_Spec.enso @@ -1,6 +1,6 @@ from Standard.Base import all -import Standard.Base.Data.Text.Regex_2.No_Such_Group -import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error +import Standard.Base.Data.Text.Regex.No_Such_Group +import Standard.Base.Data.Text.Regex.Regex_Syntax_Error import Standard.Base.Data.Text.Span.Span import Standard.Base.Data.Text.Span.Utf_16_Span import Standard.Base.Errors.Common.Index_Out_Of_Bounds @@ -9,8 +9,6 @@ import Standard.Base.Errors.Common.Type_Error import Standard.Base.Errors.Illegal_Argument.Illegal_Argument import Standard.Base.IO -import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine - from Standard.Base.Data.Text.Text_Sub_Range.Text_Sub_Range import all from Standard.Base.Data.Index_Sub_Range.Index_Sub_Range import all diff --git a/test/Tests/src/Data/Vector_Spec.enso b/test/Tests/src/Data/Vector_Spec.enso index 10e37ff7d4..be5824f80d 100644 --- a/test/Tests/src/Data/Vector_Spec.enso +++ b/test/Tests/src/Data/Vector_Spec.enso @@ -223,7 +223,6 @@ spec = Test.group "Vectors" <| ["abab", "aaabaaaa"].filter (Filter_Condition.Like "_ba_") . should_equal ["abab"] ["abab", "aaabaaaa"].filter (Filter_Condition.Like "%ba__%") . should_equal ["aaabaaaa"] ["aaaa", "bbbbb", "[ab]aaaa"].filter (Filter_Condition.Like "[ab]%") . should_equal ["[ab]aaaa"] - ["a\Qa\Eabb", "aaabb"].filter (Filter_Condition.Like "_\Qa\Ea%") . should_equal ["a\Qa\Eabb"] ["f.txt", "abc.*"].filter (Filter_Condition.Like "%.*") . should_equal ["abc.*"] ["f.txt", "abc.*"].filter (Filter_Condition.Not_Like "%.*") . 
should_equal ["f.txt"] diff --git a/test/Tests/src/Main.enso b/test/Tests/src/Main.enso index bdd8c7eef5..6e0d35ebfa 100644 --- a/test/Tests/src/Main.enso +++ b/test/Tests/src/Main.enso @@ -50,9 +50,7 @@ import project.Data.Regression_Spec import project.Data.Text_Spec import project.Data.Text.Text_Sub_Range_Spec -import project.Data.Text.Default_Regex_Engine_Spec import project.Data.Text.Encoding_Spec -import project.Data.Text.Matching_Spec import project.Data.Text.Regex_Spec import project.Data.Text.Span_Spec import project.Data.Text.Utils_Spec @@ -126,10 +124,8 @@ main = Test_Suite.run_main <| Problems_Spec.spec Range_Spec.spec Ref_Spec.spec - Lazy_Spec.spec - Default_Regex_Engine_Spec.spec Regex_Spec.spec - Matching_Spec.spec + Lazy_Spec.spec Runtime_Spec.spec Self_Type_Spec.spec Span_Spec.spec