mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 11:52:59 +03:00
Remove old (Java) Regex library and replace with new (Truffle) library. (#6195)
Remove old (Java) Regex library and replace with new (Truffle) library.
This commit is contained in:
parent
2531aeeece
commit
d9bc5246ba
@ -100,13 +100,11 @@ type Filter_Condition
|
||||
Table operations, it can accept another column - then the corresponding
|
||||
values from the source column and the provided column are checked.
|
||||
|
||||
! Known Bugs
|
||||
There is a known bug in Java Regex where escape characters are not
|
||||
handled properly in Unicode-normalized matching mode. Due to this
|
||||
limitation, Unicode normalization has been disabled for this function,
|
||||
so beware that some equivalent graphemes like 'ś' and 's\u0301' will
|
||||
not be matched.
|
||||
See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
|
||||
! Known Limitations
|
||||
The Truffle regex engine does not transparently handle normalization.
|
||||
Due to this limitation, Unicode normalization has been disabled for
|
||||
this function, so beware that some equivalent graphemes like 'ś' and
|
||||
's\u0301' will not be matched.
|
||||
Like pattern:Text
|
||||
|
||||
## Does the value not match the SQL pattern (Text only)?
|
||||
@ -121,13 +119,11 @@ type Filter_Condition
|
||||
Table operations, it can accept another column - then the corresponding
|
||||
values from the source column and the provided column are checked.
|
||||
|
||||
! Known Bugs
|
||||
There is a known bug in Java Regex where escape characters are not
|
||||
handled properly in Unicode-normalized matching mode. Due to this
|
||||
limitation, Unicode normalization has been disabled for this function,
|
||||
so beware that some equivalent graphemes like 'ś' and 's\u0301' will
|
||||
not be matched.
|
||||
See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
|
||||
! Known Limitations
|
||||
The Truffle regex engine does not transparently handle normalization.
|
||||
Due to this limitation, Unicode normalization has been disabled for
|
||||
this function, so beware that some equivalent graphemes like 'ś' and
|
||||
's\u0301' will not be matched.
|
||||
Not_Like pattern:Text
|
||||
|
||||
## Is the value contained in `values`?
|
||||
@ -212,7 +208,4 @@ type Filter_Condition
|
||||
## PRIVATE
|
||||
sql_like_to_regex sql_pattern =
|
||||
regex_pattern = Regex_Utils.sql_like_pattern_to_regex sql_pattern
|
||||
## There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting.
|
||||
https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
|
||||
Once that bug is fixed, `match_ascii` may be set back to `False`.
|
||||
Regex.compile regex_pattern dot_matches_newline=True match_ascii=True
|
||||
Regex.compile regex_pattern
|
||||
|
@ -9,7 +9,6 @@ from project.Data.Boolean import Boolean, True, False
|
||||
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
|
||||
|
||||
## Enso's text type.
|
||||
|
||||
Enso's text type is natively unicode aware, and will handle arbitrary
|
||||
|
@ -12,10 +12,9 @@ import project.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import project.Data.Text.Encoding.Encoding
|
||||
import project.Data.Text.Location.Location
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
|
||||
import project.Data.Text.Regex_2
|
||||
import project.Data.Text.Regex_2.Regex_Syntax_Error
|
||||
import project.Data.Text.Regex.Regex_Syntax_Error
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
@ -233,7 +232,7 @@ Text.characters self =
|
||||
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error | Illegal_Argument
|
||||
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern.match self
|
||||
|
||||
## Finds all the matches of the regular expression `pattern` in `self`,
|
||||
@ -260,7 +259,7 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error | Illegal_Argument
|
||||
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern.match_all self
|
||||
|
||||
## ALIAS Check Matches
|
||||
@ -290,7 +289,7 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error | Illegal_Argument
|
||||
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern.matches self
|
||||
|
||||
## ALIAS Split Text
|
||||
@ -348,7 +347,7 @@ Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_re
|
||||
True -> case delimiter of
|
||||
_ : Text ->
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
|
||||
compiled_pattern = Regex.compile delimiter case_insensitive=case_insensitive
|
||||
compiled_pattern.split self
|
||||
_ : Vector ->
|
||||
parenthesize s = "(?:" + s + ")"
|
||||
@ -383,7 +382,7 @@ Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_re
|
||||
Text.tokenize : Text -> Case_Sensitivity -> Vector Text
|
||||
Text.tokenize self pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern.tokenize self
|
||||
|
||||
## ALIAS Replace Text
|
||||
@ -477,7 +476,7 @@ Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive o
|
||||
Text_Utils.replace_spans self spans_array replacement
|
||||
True ->
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
|
||||
compiled_pattern = Regex.compile term case_insensitive=case_insensitive
|
||||
compiled_pattern.replace self replacement only_first
|
||||
|
||||
## ALIAS Get Words
|
||||
|
@ -0,0 +1,17 @@
|
||||
from Standard.Base import all
|
||||
|
||||
import project.Any.Any
|
||||
import project.Data.Locale.Locale
|
||||
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import project.Errors.Common.Type_Error
|
||||
import project.Meta
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Assert that `text_maybe` is a Text, then call the action.
|
||||
expect_text : Any -> Any -> Any ! Type_Error
|
||||
expect_text text_maybe ~action = case text_maybe of
|
||||
_ : Text -> action
|
||||
_ ->
|
||||
Error.throw (Type_Error.Error Text (Meta.type_of text_maybe) "text_maybe")
|
||||
|
@ -1,110 +0,0 @@
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Pair.Pair
|
||||
import project.Data.Range.Extensions
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Problem_Behavior.Problem_Behavior
|
||||
import project.Panic.Panic
|
||||
import project.Panic.Wrapped_Dataflow_Error
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
|
||||
## UNSTABLE
|
||||
An error indicating that some criteria did not match any names in the input.
|
||||
type No_Matches_Found
|
||||
Error (criteria : Vector Text)
|
||||
|
||||
to_display_text : Text
|
||||
to_display_text self =
|
||||
"The criteria "+self.criteria.to_text+" did not match any names in the input."
|
||||
|
||||
## PRIVATE
|
||||
match_criteria_implementation matcher objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning =
|
||||
result = internal_match_criteria_implementation matcher objects criteria reorder name_mapper
|
||||
unmatched_criteria = result.second
|
||||
problems = if unmatched_criteria.is_empty then [] else
|
||||
[No_Matches_Found.Error unmatched_criteria]
|
||||
on_problems.attach_problems_after result.first problems
|
||||
|
||||
## PRIVATE
|
||||
match_criteria_callback matcher objects criteria problem_callback reorder=False name_mapper=(x->x) =
|
||||
result = internal_match_criteria_implementation matcher objects criteria reorder name_mapper
|
||||
unmatched_criteria = result.second
|
||||
problem_callback unmatched_criteria
|
||||
result.first
|
||||
|
||||
type Match_Matrix
|
||||
## PRIVATE
|
||||
A helper type holding a matrix of matches.
|
||||
Value matrix criteria objects
|
||||
|
||||
# Checks if the ith object is matched by any criterion.
|
||||
is_object_matched_by_anything : Integer -> Boolean
|
||||
is_object_matched_by_anything self i =
|
||||
self.matrix.at i . any x->x
|
||||
|
||||
# Checks if the ith criterion matches any objects.
|
||||
does_criterion_match_anything : Integer -> Boolean
|
||||
does_criterion_match_anything self i =
|
||||
self.matrix.map (col -> col.at i) . any x->x
|
||||
|
||||
## PRIVATE
|
||||
Extracts the list of criteria that did not have any matches.
|
||||
unmatched_criteria self =
|
||||
checked_criteria = self.criteria.map_with_index j-> criterion->
|
||||
has_matches = self.does_criterion_match_anything j
|
||||
Pair.new has_matches criterion
|
||||
checked_criteria.filter (p -> p.first.not) . map .second
|
||||
|
||||
## PRIVATE
|
||||
Returns the list of criteria that match the ith object.
|
||||
criteria_matching_object : Integer -> Vector
|
||||
criteria_matching_object self i =
|
||||
self.criteria.filter_with_index j-> _->
|
||||
self.matrix . at i . at j
|
||||
|
||||
## PRIVATE
|
||||
Returns the list of criteria indices that match the ith object.
|
||||
criteria_indices_matching_object : Integer -> Vector
|
||||
criteria_indices_matching_object self i =
|
||||
(0.up_to self.criteria.length).filter j->
|
||||
self.matrix . at i . at j
|
||||
|
||||
## PRIVATE
|
||||
Generates a matrix specifying which criteria match which object.
|
||||
|
||||
The returned `match_matrix` satisfies the following condition:
|
||||
`match_matrix . at i . at j` is `True` if and only if `objects.at i` matches
|
||||
`criteria.at j`.
|
||||
make_match_matrix matcher objects criteria object_name_mapper=(x->x) criterion_mapper=(x->x) =
|
||||
matrix = objects.map obj->
|
||||
criteria.map criterion->
|
||||
matcher.match_single_criterion (object_name_mapper obj) (criterion_mapper criterion)
|
||||
Match_Matrix.Value matrix criteria objects
|
||||
|
||||
## PRIVATE
|
||||
internal_match_criteria_implementation matcher objects criteria reorder=False name_mapper=(x->x) = Panic.catch Wrapped_Dataflow_Error (handler = x-> x.payload.unwrap) <|
|
||||
## TODO [RW] discuss: this line of code also shows an issue we had with ensuring input dataflow-errors are correctly propagated, later on we stopped doing that and testing for that as it was too cumbersome. Maybe it could be helped with an @Accepts_Error annotation similar to the one from the interpreter???
|
||||
[matcher, objects, criteria, reorder, name_mapper] . each v->
|
||||
Panic.rethrow (v.map_error Wrapped_Dataflow_Error.Error)
|
||||
|
||||
match_matrix = make_match_matrix matcher objects criteria name_mapper
|
||||
unmatched_criteria = match_matrix.unmatched_criteria
|
||||
|
||||
# Selects object indices which satisfy the provided predicate.
|
||||
select_matching_indices : (Integer -> Boolean) -> Vector Text
|
||||
select_matching_indices matcher =
|
||||
0.up_to objects.length . to_vector . filter matcher
|
||||
|
||||
selected_indices = case reorder of
|
||||
True ->
|
||||
nested_indices = 0.up_to criteria.length . map j->
|
||||
is_object_matched_by_this_criterion i =
|
||||
match_matrix.matrix.at i . at j
|
||||
select_matching_indices is_object_matched_by_this_criterion
|
||||
nested_indices.flat_map x->x . distinct
|
||||
False ->
|
||||
select_matching_indices match_matrix.is_object_matched_by_anything
|
||||
|
||||
result = selected_indices.map objects.at
|
||||
Pair.new result unmatched_criteria
|
@ -1,124 +1,63 @@
|
||||
## This module contains the basic interface to the more advanced functionality
|
||||
of Enso's regular expression engine.
|
||||
|
||||
TODO Examples
|
||||
|
||||
import project.Data.Boolean.Boolean
|
||||
import project.Any.Any
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Regex.Engine.Engine
|
||||
import project.Data.Text.Prim_Text_Helper
|
||||
import project.Data.Text.Regex.Pattern.Pattern
|
||||
import project.Data.Text.Regex.Engine.Default
|
||||
import project.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Common.Compile_Error
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Errors.Common import Syntax_Error
|
||||
|
||||
polyglot java import org.enso.base.Regex_Utils
|
||||
|
||||
## Compile the provided `expression` into a regex pattern that can be used for
|
||||
matching.
|
||||
|
||||
Arguments
|
||||
- expression: The text representing the regular expression that you want to
|
||||
compile.
|
||||
- engine: The regular expression engine to use. It defaults to Enso's
|
||||
built-in one which has good performance and a full feature-set.
|
||||
- match_ascii: Enables or disables pure-ASCII matching for the regex. If you
|
||||
know your data only contains ASCII then you can enable this for a
|
||||
performance boost on some regex engines.
|
||||
compile. Must be non-empty.
|
||||
- case_insensitive: Enables or disables case-insensitive matching. Case
|
||||
insensitive matching behaves as if it normalises the case of all input
|
||||
text before matching on it.
|
||||
- dot_matches_newline: Enables or disables the dot matches newline option.
|
||||
This specifies that the `.` special character should match everything
|
||||
_including_ newline characters. Without this flag, it will match all
|
||||
characters _except_ newlines.
|
||||
- multiline: Enables or disables the multiline option. Multiline specifies
|
||||
that the `^` and `$` pattern characters match the start and end of lines,
|
||||
as well as the start and end of the input respectively.
|
||||
- comments: Enables or disables the comments mode for the regular expression.
|
||||
In comments mode, the following changes apply:
|
||||
- Whitespace within the pattern is ignored, except when within a
|
||||
character class or when preceded by an unescaped backslash, or within
|
||||
grouping constructs (e.g. `(?...)`).
|
||||
- When a line contains a `#`, that is not in a character class and is not
|
||||
preceded by an unescaped backslash, all characters from the leftmost
|
||||
such `#` to the end of the line are ignored. That is to say, they act
|
||||
as _comments_ in the regex.
|
||||
- extra_opts: Specifies additional options in a vector. This allows options
|
||||
to be supplied and computed without having to break them out into arguments
|
||||
to the function. Where these overlap with one of the flags (`match_ascii`,
|
||||
`case_insensitive`, `dot_matches_newline`, `multiline` and `comments`), the
|
||||
flags take precedence.
|
||||
|
||||
! Boolean Flags and Extra Options
|
||||
This function contains a number of arguments that are boolean flags that
|
||||
enable or disable common options for the regex. At the same time, it also
|
||||
provides the ability to specify options in the `extra_opts` argument.
|
||||
|
||||
Where one of the flags is _set_ (has the value `True` or `False`), the
|
||||
value of the flag takes precedence over the value in `extra_opts` when
|
||||
merging the options to the engine. The flags are _unset_ (have value
|
||||
`Nothing`) by default.
|
||||
If an empty regex is used, `compile` throws an Illegal_Argument error.
|
||||
|
||||
? Why Compile?
|
||||
While many regex engines are able to cache ad-hoc patterns, it is often
|
||||
useful to be able to manually retain a pattern that you have computed. This
|
||||
function exists so you can hold onto the resultant `Pattern` object,
|
||||
instead of immediately proceeding to match using it.
|
||||
compile : Text -> Engine -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Vector Regex_Option -> Pattern ! Compile_Error
|
||||
compile expression engine=Default.new match_ascii=Nothing case_insensitive=Nothing dot_matches_newline=Nothing multiline=Nothing comments=Nothing extra_opts=[] =
|
||||
options_vec = from_flags match_ascii case_insensitive dot_matches_newline multiline comments extra_opts
|
||||
engine.compile expression options_vec
|
||||
compile : Text -> Boolean | Nothing -> Pattern ! Regex_Syntax_Error | Illegal_Argument
|
||||
compile self expression case_insensitive=Nothing =
|
||||
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
|
||||
options_string = if case_insensitive == True then "usgi" else "usg"
|
||||
|
||||
## Escape the special characters in `expression` such that the result is a valid
|
||||
literal pattern for the original string.
|
||||
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
|
||||
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
|
||||
|
||||
Pattern.Value internal_regex_object
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Escape the special characters in `expression` such that the result is a
|
||||
valid literal pattern for the original string.
|
||||
|
||||
Arguments:
|
||||
- expression: The expression to escape metacharacters in.
|
||||
- engine: The regular expression engine to use. It defaults to Enso's
|
||||
built-in one which has good performance and a full feature-set.
|
||||
|
||||
! Matching Engines
|
||||
Care should be taken to ensure that you use the same engine for escaping
|
||||
and matching, as engine syntax may differ in certain cases.
|
||||
escape : Text -> Engine -> Text
|
||||
escape expression engine=Default.new = engine.escape expression
|
||||
> Example
|
||||
Turn a Text into a regex that matches that string exactly.
|
||||
|
||||
## PRIVATE
|
||||
example_escape =
|
||||
literal_string = "\!\.|abcde"
|
||||
Regex.escape literal_string
|
||||
escape : Text -> Text
|
||||
escape self expression = Regex_Utils.regexQuote expression
|
||||
|
||||
Turns the options flags into a vector of options.
|
||||
from_flags : Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Vector Regex_Option -> Vector Regex_Option
|
||||
from_flags match_ascii case_insensitive dot_matches_newline multiline comments extra_opts =
|
||||
builder = Vector.new_builder
|
||||
|
||||
process_override : Boolean | Nothing -> Regex_Option -> Nothing
|
||||
process_override param option = case param of
|
||||
_ : Boolean -> if param then builder.append option
|
||||
Nothing -> if extra_opts.contains option then builder.append option
|
||||
|
||||
process_override match_ascii Regex_Option.Ascii_Matching
|
||||
process_override case_insensitive Regex_Option.Case_Insensitive
|
||||
process_override dot_matches_newline Regex_Option.Dot_Matches_Newline
|
||||
process_override multiline Regex_Option.Multiline
|
||||
process_override comments Regex_Option.Comments
|
||||
|
||||
## Add any non-overridable options from extra_opts
|
||||
extra_opts.each opt->
|
||||
not_ascii = opt != Regex_Option.Ascii_Matching
|
||||
not_insensitive = opt != Regex_Option.Case_Insensitive
|
||||
not_dot_matches_newline = opt != Regex_Option.Dot_Matches_Newline
|
||||
not_multiline = opt != Regex_Option.Multiline
|
||||
not_comments = opt != Regex_Option.Comments
|
||||
|
||||
if not_ascii && not_insensitive && not_dot_matches_newline && not_multiline && not_comments then
|
||||
builder.append opt
|
||||
|
||||
builder.to_vector
|
||||
|
||||
## PRIVATE
|
||||
|
||||
An error that is emitted when there is no such group in the match for the
|
||||
## An error that is emitted when there is no such group in the match for the
|
||||
provided `id`.
|
||||
|
||||
Arguments:
|
||||
@ -134,46 +73,10 @@ type No_Such_Group
|
||||
_ : Integer -> "No group exists with the index " + self.id.to_text + "."
|
||||
_ : Text -> "No group exists with the name " + self.id + "."
|
||||
|
||||
## PRIVATE
|
||||
|
||||
An error representing that one of the passed options was invalid.
|
||||
|
||||
Arguments:
|
||||
- opt: The option that was not valid for this regex engine.
|
||||
type Invalid_Option
|
||||
Error (opt : Any)
|
||||
|
||||
## A syntax error reported by the Truffle regex compiler.
|
||||
type Regex_Syntax_Error
|
||||
## PRIVATE
|
||||
|
||||
Provides a human-readable representation of the invalid option error.
|
||||
to_display_text : Text
|
||||
to_display_text self =
|
||||
"The option " + self.opt.to_text + " is not valid for the default regex engine."
|
||||
|
||||
## PRIVATE
|
||||
|
||||
An error representing that there is something wrong with the mode for a regex
|
||||
match.
|
||||
|
||||
Arguments:
|
||||
- message: The text of the message to display to users.
|
||||
type Mode_Error
|
||||
Error (message : Text)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Provides a human-readable representation of the mode error.
|
||||
to_display_text : Text
|
||||
to_display_text self = self.message.to_text
|
||||
|
||||
## PRIVATE
|
||||
|
||||
An error representing that the bounds for a match are invalid.
|
||||
type Invalid_Bounds_Error
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Provides a human-readable representation of the invalid bounds error.
|
||||
to_display_text : Text
|
||||
to_display_text =
|
||||
"The start bound cannot be greater than the end bound."
|
||||
Arguments:
|
||||
- message: A description of the erroneous syntax.
|
||||
Error message
|
||||
|
@ -1,51 +0,0 @@
|
||||
## An `Engine` is a configuration and behaviour specification object for a
|
||||
particular regular expression engine.
|
||||
|
||||
An implementation of a regular expression engine must implement the below
|
||||
interface, as well as conform to the following requirements:
|
||||
|
||||
- The engine must operate in a unicode mode by default, using canonical
|
||||
form for equality and the unicode versions of the standard character
|
||||
classes.
|
||||
- It must support the standard options specified in
|
||||
`Standard.Base.Data.Text.Regex.Regex_Option`. It may specify additional,
|
||||
engine-specific options, but this is not required by the specification.
|
||||
- In the defining module, the engine implementation must provide a full
|
||||
specification of its syntax in the module documentation block.
|
||||
|
||||
This file is _not executable_. It instead describes the interface for the
|
||||
customisable `Engine` and `Pattern` types.
|
||||
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
import project.Data.Text.Regex.Invalid_Option
|
||||
import project.Data.Text.Regex.Pattern.Pattern
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Common.Compile_Error
|
||||
import project.Errors.Unimplemented.Unimplemented
|
||||
|
||||
## The `Data.Text.Regex.Engine.Engine` interface.
|
||||
type Engine
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Compile the provided `expression` into a regex pattern that can be used
|
||||
for matching.
|
||||
|
||||
Arguments
|
||||
- expression: The text representing the regular expression that you want
|
||||
to compile.
|
||||
- options: The options to configure the matching process with. These are
|
||||
merged with the specific `engine_opts`.
|
||||
compile : Text -> Vector Regex_Option -> Pattern ! (Compile_Error | Invalid_Option)
|
||||
compile self _ _ = Unimplemented.throw "This is an interface only."
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Escape the special characters in `expression` such that the result is a
|
||||
valid literal pattern for the original string.
|
||||
|
||||
Arguments:
|
||||
- expression: The expression to escape metacharacters in.
|
||||
escape : Text -> Text
|
||||
escape self _ = Unimplemented.throw "This is an interface only."
|
@ -1,888 +0,0 @@
|
||||
## Enso's default regular expression matching engine.
|
||||
|
||||
Enso's default regular expression engine uses Java's regular expression
|
||||
syntax, extended with support for the unicode character classes and
|
||||
properties. A detailed explanation of the syntax is below.
|
||||
|
||||
! Raw Strings
|
||||
Enso has support for raw strings using the `""` quotes. Within a raw
|
||||
string, all characters are interpreted to mean themselves. This means that
|
||||
you do not need to double-escape special characters in regular expressions.
|
||||
|
||||
! Characters and Regex
|
||||
When the default regex engine provides a position with regard to
|
||||
"characters", it is referring to positions in terms of the UTF-16
|
||||
characters in the text. These indices must be used to index into the
|
||||
vector of UTF-16 characters. It will otherwise be wrong.
|
||||
|
||||
! Escaping
|
||||
The backslash character `"\"` serves to introduce escaped constructs, as
|
||||
defined in "Syntax Specification" below, as well as to quote characters
|
||||
that would otherwise be interpreted as unescaped constructs. As a result,
|
||||
the expression `"\\"` matches a single backslash, and `"\{"` matches an
|
||||
opening brace.
|
||||
|
||||
It is a parse error for the regular expression to use a backslash prior to
|
||||
any alphabetic character that does not denote an escaped construct. It is,
|
||||
however, valid to put a backslash before any symbolic character.
|
||||
|
||||
? Syntax Specification
|
||||
The syntax supported by the default regular expression engine is described
|
||||
here. The pattern described by the regular expression can then be used to
|
||||
match against text.
|
||||
|
||||
TBC
|
||||
|
||||
import project.Any.Any
|
||||
import project.Data.Map.Map
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Range.Extensions
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
import project.Data.Text.Regex.Invalid_Option
|
||||
import project.Data.Text.Regex.Invalid_Bounds_Error
|
||||
import project.Data.Text.Regex.Mode_Error
|
||||
import project.Data.Text.Regex.No_Such_Group
|
||||
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
|
||||
import project.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Vector.Vector
|
||||
import project.Meta
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Errors.Common import Compile_Error, Syntax_Error
|
||||
|
||||
polyglot java import java.lang.IllegalArgumentException
|
||||
polyglot java import java.lang.IndexOutOfBoundsException
|
||||
polyglot java import java.lang.StringBuffer
|
||||
polyglot java import java.util.regex.Matcher as Java_Matcher
|
||||
polyglot java import java.util.regex.Pattern as Java_Pattern
|
||||
polyglot java import java.util.regex.PatternSyntaxException
|
||||
|
||||
polyglot java import com.ibm.icu.impl.UnicodeRegex
|
||||
polyglot java import org.enso.base.Regex_Utils
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
|
||||
## Construct an instance of the default engine.
|
||||
|
||||
Arguments:
|
||||
- opts: Any engine-specific options.
|
||||
|
||||
> Example
|
||||
Build a new default engine specifying literal mode.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
|
||||
example_new =
|
||||
engine_opts = [Default_Engine.Option.Literal_Pattern]
|
||||
Default_Engine.new engine_opts
|
||||
new : Vector (Regex_Option | Option) -> Default_Engine
|
||||
new opts=[] = Default_Engine.Value opts
|
||||
|
||||
## The default implementation of the `Data.Text.Regex.Engine.Engine` interface.
|
||||
type Default_Engine
|
||||
|
||||
## PRIVATE
|
||||
|
||||
The default regex engine for Enso.
|
||||
|
||||
Arguments:
|
||||
- engine_opts: Options for regex matching that are specific to this
|
||||
engine.
|
||||
Value (engine_opts : Vector (Regex_Option | Option))
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Compile the provided `expression` into a regex pattern that can be used
|
||||
for matching.
|
||||
|
||||
Arguments
|
||||
- expression: The text representing the regular expression that you want
|
||||
to compile.
|
||||
- options: The options to configure the matching process with. These are
|
||||
merged with the specific `engine_opts`.
|
||||
|
||||
? Why Compile?
|
||||
While many regex engines are able to cache ad-hoc patterns, it is often
|
||||
useful to be able to manually retain a pattern that you have computed.
|
||||
This function exists so you can hold onto the resultant `Pattern`
|
||||
object, instead of immediately proceeding to match using it.
|
||||
|
||||
> Example
|
||||
Compile the regex `"^a$"` in multiline mode so it matches all lines
|
||||
consisting of a single "a".
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
|
||||
example_compile =
|
||||
expression = "^a$"
|
||||
options = [Regex_Option.Multiline]
|
||||
engine = Default_Engine.new
|
||||
engine.compile expression options
|
||||
compile : Text -> Vector (Regex_Option | Option) -> Pattern ! (Compile_Error | Invalid_Option)
|
||||
compile self expression options =
|
||||
all_options = options + self.engine_opts
|
||||
options_bitmask = from_enso_options all_options
|
||||
unicode_regex = UnicodeRegex.new
|
||||
|
||||
maybe_java_pattern = Panic.recover Any <|
|
||||
Java_Pattern.compile (unicode_regex.transform expression) options_bitmask
|
||||
|
||||
internal_pattern = maybe_java_pattern.map_error case _ of
|
||||
err : PatternSyntaxException -> Syntax_Error.Error ("The regex could not be compiled: " + err.getMessage)
|
||||
other -> other
|
||||
|
||||
Pattern.Value internal_pattern all_options self
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Escape the special characters in `expression` such that the result is a
|
||||
valid literal pattern for the original string.
|
||||
|
||||
Arguments:
|
||||
- expression: The expression to escape metacharacters in.
|
||||
|
||||
> Example
|
||||
Turn a literal string into a regex that matches that string exactly.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
|
||||
example_escape =
|
||||
literal_string = "\!\.|abcde"
|
||||
engine = Default_Engine.new
|
||||
engine.escape literal_string
|
||||
escape : Text -> Text
|
||||
escape self expression = Java_Pattern.quote expression
|
||||
|
||||
## The default implementation of the `Data.Text.Regex.Engine.Pattern` interface.
|
||||
type Pattern
|
||||
|
||||
## PRIVATE
|
||||
|
||||
The default pattern type for Enso, produced by the default regex engine.
|
||||
|
||||
Arguments:
|
||||
- internal_pattern: The internal representation of the compiled pattern.
|
||||
- options: The vector of options with which this pattern was built.
|
||||
- engine: A handle to the engine that built this pattern.
|
||||
Value (internal_pattern : Java_Pattern) (options : Vector (Regex_Option | Option)) (engine : Default_Engine)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Constructs an internal matcher, setting the region as provided and
|
||||
handling some additional options.
|
||||
|
||||
Arguments:
|
||||
- input: The text on which it will be matching.
|
||||
- start: The start of the matcher's region.
|
||||
- end: The end of the matcher's region.
|
||||
|
||||
! Unicode Normalization
|
||||
The Regex engine used here handles string modifiers, like accents, in a
|
||||
weird way. The string "s\u{301}" will be treated as containing "s"
|
||||
within it, but "ś" (which is canonically equivalent to the former one)
|
||||
will not contain "s". To get consistent behavior that does not depend
|
||||
on the encoding, we normalize all input.
|
||||
build_matcher : Text -> Integer -> Integer -> Java_Matcher
|
||||
build_matcher self input start end =
|
||||
## TODO [RW] Normalization had to be disabled - since start and end are
|
||||
in code unit space, normalization could shift these indices!
|
||||
This should be addressed when reviewing
|
||||
See: https://www.pivotaltracker.com/story/show/181524498
|
||||
#normalized_input = if self.options.contains Regex_Option.Ascii_Matching then input else
|
||||
# Text_Utils.normalize input
|
||||
normalized_input = input
|
||||
internal_matcher = self.internal_pattern.matcher normalized_input . region start end
|
||||
|
||||
if self.options.contains Option.No_Anchoring_Bounds then
|
||||
internal_matcher.useAnchoringBounds False
|
||||
if self.options.contains Option.Transparent_Bounds then
|
||||
internal_matcher.useTransparentBounds True
|
||||
|
||||
internal_matcher
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
- mode: The matching mode to use.
|
||||
|
||||
This method will _always_ return `Nothing` if it fails to match.
|
||||
|
||||
? Return Type
|
||||
When asked to match in a mode that can only provide a single match, the
|
||||
return type is a single `Match` object. When asked to match in a
|
||||
mode that permits multiple matches, it will always return a `Vector`,
|
||||
even if only a single match is found.
|
||||
|
||||
> Example
|
||||
Match the first instance of the pattern `".."` in the input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".." []
|
||||
input = "abcdefghij"
|
||||
pattern.match input mode=Matching_Mode.First
|
||||
|
||||
> Example
|
||||
Match up to the first 3 instances of the pattern `".."` in the input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".." []
|
||||
input = "abcdefghij"
|
||||
pattern.match input mode=3
|
||||
|
||||
> Example
|
||||
Match all instances of the pattern `".."` in the input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".." []
|
||||
input = "abcdefghij"
|
||||
pattern.match input
|
||||
|
||||
> Example
|
||||
Check if the pattern `".*"` matches on the entire input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".*" []
|
||||
input = "abcdefghij"
|
||||
pattern.match input mode=Regex_Mode.Full
|
||||
match : Text -> (Regex_Mode | Matching_Mode) -> Match | Vector Match | Nothing
|
||||
match self input mode=Regex_Mode.All =
|
||||
do_match_mode mode start end = case mode of
|
||||
Matching_Mode.First ->
|
||||
internal_matcher = self.build_matcher input start end
|
||||
|
||||
if internal_matcher . find start . not then Nothing else
|
||||
Match.Value internal_matcher start end input
|
||||
_ : Integer ->
|
||||
if mode < 0 then Panic.throw <|
|
||||
Mode_Error.Error "Cannot match a negative number of times."
|
||||
|
||||
builder = Vector.new_builder
|
||||
|
||||
go : Integer -> Integer -> Nothing
|
||||
go offset remaining_count =
|
||||
should_continue = remaining_count > 0
|
||||
if should_continue.not || (offset >= end) then Nothing else
|
||||
internal_matcher = self.build_matcher input start end
|
||||
found = internal_matcher.find offset
|
||||
|
||||
if found.not then Nothing else
|
||||
builder.append (Match.Value internal_matcher start end input)
|
||||
match_end = internal_matcher.end 0
|
||||
# Ensure progress even if the match is an empty string.
|
||||
new_offset = if match_end > offset then match_end else offset+1
|
||||
@Tail_Call go new_offset remaining_count-1
|
||||
|
||||
go start mode
|
||||
vector = builder.to_vector
|
||||
|
||||
if vector.is_empty then Nothing else vector
|
||||
Regex_Mode.All ->
|
||||
builder = Vector.new_builder
|
||||
|
||||
go : Integer -> Nothing
|
||||
go offset =
|
||||
if offset >= end then Nothing else
|
||||
internal_matcher = self.build_matcher input start end
|
||||
found = internal_matcher.find offset
|
||||
|
||||
if found.not then Nothing else
|
||||
builder.append (Match.Value internal_matcher start end input)
|
||||
match_end = internal_matcher.end 0
|
||||
# Ensure progress even if the match is an empty string.
|
||||
new_offset = if match_end > offset then match_end else offset+1
|
||||
@Tail_Call go new_offset
|
||||
|
||||
go start
|
||||
vector = builder.to_vector
|
||||
|
||||
if vector.is_empty then Nothing else vector
|
||||
Regex_Mode.Full ->
|
||||
internal_matcher = self.build_matcher input start end
|
||||
if internal_matcher.matches.not then Nothing else
|
||||
Match.Value internal_matcher start end input
|
||||
Regex_Mode.Bounded _ _ _ -> Panic.throw <|
|
||||
Mode_Error.Error "Modes cannot be recursive."
|
||||
|
||||
case mode of
|
||||
Regex_Mode.Bounded start end sub_mode ->
|
||||
if start < end then do_match_mode sub_mode start end else
|
||||
Panic.throw Invalid_Bounds_Error
|
||||
_ -> do_match_mode mode 0 (Text_Utils.char_length input)
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Returns `True` if the input matches against the pattern described by
|
||||
`self`, otherwise `False`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to check for matching.
|
||||
|
||||
> Example
|
||||
Check if the input "aa" matches against the pattern `".."`.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".." []
|
||||
input = "aa"
|
||||
pattern.matches input
|
||||
matches : Text -> Boolean
|
||||
matches self input = case self.match input mode=Regex_Mode.Full of
|
||||
_ : Match -> True
|
||||
_ : Vector -> True
|
||||
_ -> False
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Tries to find the text in the `input` that matches against the pattern
|
||||
`self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to find matches in.
|
||||
- mode: The matching mode to use.
|
||||
|
||||
This method will _always_ return `Nothing` if it fails to find any
|
||||
matches.
|
||||
|
||||
? Return Type
|
||||
When asked to match in a mode that can only provide a single match, the
|
||||
return type is a single `Text`. When asked to match in a
|
||||
mode that permits multiple matches, it will always return a `Vector`,
|
||||
even if only a single match is found.
|
||||
|
||||
> Example
|
||||
Find the first instance of the pattern `".."` in the input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".." []
|
||||
input = "abcdefghij"
|
||||
pattern.find input mode=Matching_Mode.First
|
||||
|
||||
> Example
|
||||
Find up to the first 3 instances of the pattern `".."` in the input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".." []
|
||||
input = "abcdefghij"
|
||||
pattern.find input mode=3
|
||||
|
||||
> Example
|
||||
Find all instances of the pattern `".."` in the input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".." []
|
||||
input = "abcdefghij"
|
||||
pattern.find input
|
||||
|
||||
> Example
|
||||
Find if the pattern `".*"` matches on the entire input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".*" []
|
||||
input = "abcdefghij"
|
||||
pattern.find input mode=Regex_Mode.Full
|
||||
find : Text -> (Regex_Mode | Matching_Mode) -> Text | Vector Text | Nothing
|
||||
find self input mode=Regex_Mode.All =
|
||||
matches = self.match input mode
|
||||
case matches of
|
||||
_ : Match -> matches.group 0
|
||||
_ : Vector -> matches.map (_.group 0)
|
||||
_ -> matches
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Splits the `input` text based on the pattern described by `self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to split based on the pattern described by `self`.
|
||||
- mode: The splitting mode to use.
|
||||
|
||||
This method will _always_ return a vector. If no splits take place, the
|
||||
vector will contain a single element.
|
||||
|
||||
> Example
|
||||
Split the input on the first instance of the pattern `"aa"`.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile "aa" []
|
||||
input = "abaaabbaabba"
|
||||
pattern.split input mode=Matching_Mode.First
|
||||
|
||||
> Example
|
||||
Split on up to the first 3 instances of the pattern `"a"` in the input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile "a" []
|
||||
input = "bacadaeaf"
|
||||
pattern.split input mode=3
|
||||
|
||||
> Example
|
||||
Split on all instances of the pattern `"a"` in the input.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile "a" []
|
||||
input = "bacadaeaf"
|
||||
pattern.split input
|
||||
split : Text -> Matching_Mode | Regex_Mode | Integer -> Vector Text
|
||||
split self input mode=Regex_Mode.All =
|
||||
# Java uses this to mean the max length of the resulting array, so we
|
||||
# add 1.
|
||||
limit = case mode of
|
||||
Matching_Mode.First -> 2
|
||||
_ : Integer ->
|
||||
if mode < 0 then Panic.throw <|
|
||||
Mode_Error.Error "Cannot match a negative number of times."
|
||||
|
||||
mode + 1
|
||||
Regex_Mode.All -> -1
|
||||
Regex_Mode.Full -> Panic.throw <|
|
||||
Mode_Error.Error "Splitting on a full match yields an empty text."
|
||||
Regex_Mode.Bounded _ _ _ -> Panic.throw <|
|
||||
Mode_Error.Error "Splitting on a bounded region is not well-defined."
|
||||
Matching_Mode.Last -> Panic.throw <|
|
||||
Mode_Error.Error "Splitting on the last match is not supported."
|
||||
|
||||
splits = self.internal_pattern.split input limit
|
||||
Vector.from_polyglot_array splits
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Replace all occurrences of the pattern described by `self` in the `input`
|
||||
with the specified `replacement`.
|
||||
|
||||
Arguments:
|
||||
- input: The text in which to perform the replacement(s).
|
||||
- replacement: The literal text with which to replace any matches.
|
||||
- mode: The matching mode to use for finding candidates to replace.
|
||||
|
||||
If this method performs no replacements it will return the `input` text
|
||||
unchanged.
|
||||
|
||||
> Example
|
||||
Replace the first occurrence of the pattern `".."` in the input with
|
||||
the text `"REPLACED"`.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile ".." []
|
||||
input = "abcdefghij"
|
||||
pattern.replace input "REPLACED" mode=Matching_Mode.First
|
||||
|
||||
> Example
|
||||
Replace up to the first 3 instances of the pattern `"aa"` in the input
|
||||
with the text `"REPLACED"`.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile "aa" []
|
||||
input = "aabbaaaabb"
|
||||
pattern.replace input "REPLACED" mode=3
|
||||
|
||||
> Example
|
||||
Replace all instances of the pattern `"aa"` in the input with the text
|
||||
`"REPLACED"`.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default
|
||||
|
||||
example_match =
|
||||
engine = Default.new
|
||||
pattern = engine.compile "aa" []
|
||||
input = "aabbaabbbbbaab"
|
||||
pattern.replace input "REPLACED"
|
||||
replace : Text -> Text -> Regex_Mode | Matching_Mode | Integer -> Text
|
||||
replace self input replacement mode=Regex_Mode.All =
|
||||
do_replace_mode mode start end = case mode of
|
||||
Matching_Mode.First ->
|
||||
internal_matcher = self.build_matcher input start end
|
||||
internal_matcher.replaceFirst replacement
|
||||
_ : Integer ->
|
||||
if mode < 0 then Panic.throw <|
|
||||
Mode_Error.Error "Cannot replace a negative number of times."
|
||||
|
||||
internal_matcher = self.build_matcher input start end
|
||||
buffer = StringBuffer.new
|
||||
|
||||
go remaining_replacements =
|
||||
if (internal_matcher.find) && (remaining_replacements > 0) then
|
||||
internal_matcher.appendReplacement buffer replacement
|
||||
@Tail_Call go (remaining_replacements - 1)
|
||||
|
||||
go mode
|
||||
internal_matcher.appendTail buffer
|
||||
buffer.to_text
|
||||
Regex_Mode.All ->
|
||||
internal_matcher = self.build_matcher input start end
|
||||
internal_matcher.replaceAll replacement
|
||||
Regex_Mode.Full ->
|
||||
case self.match input mode=Regex_Mode.Full of
|
||||
_ : Match -> self.replace input replacement Matching_Mode.First
|
||||
Nothing -> input
|
||||
Matching_Mode.Last ->
|
||||
all_matches = self.match input
|
||||
all_matches_count = if all_matches.is_nothing then 0 else all_matches.length
|
||||
|
||||
if all_matches_count == 0 then input else
|
||||
internal_matcher = self.build_matcher input start end
|
||||
buffer = StringBuffer.new
|
||||
last_match_index = all_matches_count - 1
|
||||
|
||||
go match_index =
|
||||
internal_matcher.find
|
||||
case match_index == last_match_index of
|
||||
True -> internal_matcher.appendReplacement buffer replacement
|
||||
False -> @Tail_Call go (match_index + 1)
|
||||
|
||||
go 0
|
||||
internal_matcher.appendTail buffer
|
||||
buffer.to_text
|
||||
Regex_Mode.Bounded _ _ _ -> Panic.throw <|
|
||||
Mode_Error.Error "Modes cannot be recursive."
|
||||
|
||||
case mode of
|
||||
Regex_Mode.Bounded _ _ _ -> Panic.throw <|
|
||||
Mode_Error.Error "Bounded replacements are not well-formed."
|
||||
_ -> do_replace_mode mode 0 (Text_Utils.char_length input)
|
||||
|
||||
## The default implementation of the `Data.Text.Regex.Engine.Match` interface.
|
||||
type Match
|
||||
|
||||
## PRIVATE
|
||||
|
||||
A representation of a regular expression match.
|
||||
|
||||
Arguments:
|
||||
- internal_match: The internal representation of the regular expression
|
||||
match.
|
||||
- region_start: The start of the region over which the match was made.
|
||||
- region_end: The end of the region over which the match was made.
|
||||
- input: The input text that was being matched.
|
||||
Value (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) (input : Text)
|
||||
|
||||
## Gets the text matched by the group with the provided identifier, or
|
||||
`Nothing` if the group did not participate in the match. If no such group
|
||||
exists for the provided identifier, a `No_Such_Group` is returned.
|
||||
|
||||
Arguments:
|
||||
- id: The index or name of that group.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
> Example
|
||||
Get the text of the group with the index 0.
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_group =
|
||||
match = Examples.match
|
||||
match.group 0
|
||||
|
||||
> Example
|
||||
Get the text of the group with the name "letters".
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_group =
|
||||
match = Examples.match
|
||||
match.group "letters"
|
||||
group : Integer | Text -> Text | Nothing ! No_Such_Group
|
||||
group self id =
|
||||
Panic.recover Any (self.internal_match.group id) . map_error (handle_error _ id)
|
||||
|
||||
## Gets a vector containing the results of _all_ of the capturing groups in
|
||||
the pattern, replacing the value of groups that did not participate in
|
||||
the match with `default`.
|
||||
|
||||
Arguments:
|
||||
- default: The value to return for a given index when the group at that
|
||||
index did not participate in the match.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
> Example
|
||||
Get a vector of the text matched by all of the groups in this match,
|
||||
replacing the value for groups that didn't match with "UNMATCHED".
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_groups =
|
||||
match = Examples.match
|
||||
match.groups default="UNMATCHED"
|
||||
groups : Any -> Vector (Text | Any)
|
||||
groups self default=Nothing =
|
||||
group_numbers = 0.up_to self.internal_match.groupCount+1
|
||||
group_numbers.map n->
|
||||
case self.group n of
|
||||
Nothing -> default
|
||||
a -> a
|
||||
|
||||
## Gets a map containing the named capturing groups for the pattern,
|
||||
replacing the value for groups that did not participate in the match with
|
||||
`default`.
|
||||
|
||||
Arguments:
|
||||
- default: The value to return for a given name when the group with that
|
||||
name did not participate in the match.
|
||||
|
||||
> Example
|
||||
Get the map of all of the named groups in this match, replacing the
|
||||
value for groups that didn't match with "UNMATCHED".
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_groups =
|
||||
match = Examples.match
|
||||
match.named_groups default="UNMATCHED"
|
||||
named_groups : Any -> Map Text (Text | Any)
|
||||
named_groups self default=Nothing =
|
||||
group_names = Vector.from_polyglot_array <|
|
||||
Regex_Utils.get_group_names self.internal_match.pattern
|
||||
pairs = group_names.map name->
|
||||
value = case self.group name of
|
||||
Nothing -> default
|
||||
a -> a
|
||||
[name, value]
|
||||
Map.from_vector pairs
|
||||
|
||||
## Gets the index of the first character captured by the group with the
|
||||
given identifier, or `Nothing` if the group did not participate in the
|
||||
match.
|
||||
|
||||
Arguments:
|
||||
- id: The identifier for the group to fetch the start index for.
|
||||
|
||||
! What is a Character?
|
||||
This regular expression engine defines a "character" to mean a UTF-16
|
||||
character. This means that these indices should only be used with the
|
||||
result of calling `.char_vector` on the text. Using them with
|
||||
`.characters` or `.codepoints` will produce incorrect results.
|
||||
|
||||
> Example
|
||||
Get the start index in the input where the full pattern matched for
|
||||
this match.
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_start =
|
||||
match = Examples.match
|
||||
match.start 0
|
||||
start : Integer | Text -> Integer | Nothing ! No_Such_Group
|
||||
start self id =
|
||||
result = Panic.recover Any (self.internal_match.start id)
|
||||
no_errors = result.map_error (handle_error _ id)
|
||||
if no_errors == -1 then Nothing else no_errors
|
||||
|
||||
## Gets the index of the first character after `start` that was not captured
|
||||
by the group with the given identifier, or `Nothing` if the group did not
|
||||
participate in the match.
|
||||
|
||||
Arguments:
|
||||
- id: The identifier for the group to fetch the end index for.
|
||||
|
||||
! What is a Character?
|
||||
This regular expression engine defines a "character" to mean a UTF-16
|
||||
character. This means that these indices should only be used with the
|
||||
result of calling `.char_vector` on the text. Using them with
|
||||
`.characters` or `.codepoints` will produce incorrect results.
|
||||
|
||||
> Example
|
||||
Get the end index in the input where the full pattern matched for this
|
||||
match.
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_end =
|
||||
match = Examples.match
|
||||
match.end 0
|
||||
end : Integer | Text -> Integer | Nothing ! No_Such_Group
|
||||
end self id =
|
||||
result = Panic.recover Any (self.internal_match.end id)
|
||||
no_errors = result.map_error (handle_error _ id)
|
||||
if no_errors == -1 then Nothing else no_errors
|
||||
|
||||
## Returns the span matched by the group with the provided identifier, or
|
||||
`Nothing` if the group did not participate in the match.
|
||||
|
||||
Arguments:
|
||||
- id: The identifier for the group to fetch the span for.
|
||||
|
||||
! What is a Character?
|
||||
This regular expression engine defines a "character" to mean a UTF-16
|
||||
character. This means that these indices should only be used with the
|
||||
result of calling `.char_vector` on the text. Using them with
|
||||
`.characters` or `.codepoints` will produce incorrect results.
|
||||
|
||||
> Example
|
||||
Get the span over the input that was matched by the full match.
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_span =
|
||||
match = Examples.match
|
||||
match.span 0
|
||||
span : Integer | Text -> Utf_16_Span | Nothing ! No_Such_Group
|
||||
span self id = case self.group id of
|
||||
Nothing -> Nothing
|
||||
_ -> Utf_16_Span.Value ((self.start id).up_to (self.end id)) self.input
|
||||
|
||||
## Returns the start character index of the match's region.
|
||||
|
||||
! What is a Character?
|
||||
This regular expression engine defines a "character" to mean a UTF-16
|
||||
character. This means that these indices should only be used with the
|
||||
result of calling `.char_vector` on the text. Using them with
|
||||
`.characters` or `.codepoints` will produce incorrect results.
|
||||
|
||||
> Example
|
||||
Get the start position in the input to which this match was limited.
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_start_position =
|
||||
match = Examples.match
|
||||
match.start_position
|
||||
start_position : Integer
|
||||
start_position self = self.region_start
|
||||
|
||||
## Returns the end character index of the match's region.
|
||||
|
||||
! What is a Character?
|
||||
This regular expression engine defines a "character" to mean a UTF-16
|
||||
character. This means that these indices should only be used with the
|
||||
result of calling `.char_vector` on the text. Using them with
|
||||
`.characters` or `.codepoints` will produce incorrect results.
|
||||
|
||||
> Example
|
||||
Get the end position in the input to which this match was limited.
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_end_position =
|
||||
match = Examples.match
|
||||
match.end_position
|
||||
end_position : Integer
|
||||
end_position self = self.region_end
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Handle errors when looking up group info.
|
||||
|
||||
Arguments:
|
||||
- error: The error as a value.
|
||||
- id: The group identifier with which the error is associated.
|
||||
handle_error : Any -> (Text | Integer) -> Any
|
||||
handle_error error id = case error of
|
||||
_ : IndexOutOfBoundsException -> No_Such_Group.Error id
|
||||
_ : IllegalArgumentException -> No_Such_Group.Error id
|
||||
other -> other
|
||||
|
||||
## Options specific to the `Default` regular expression engine.
|
||||
type Option
|
||||
|
||||
## Specifies that the input expression to the pattern be treated as a
|
||||
sequence of literal characters. Metacharacters and escape sequences have
|
||||
no special meaning in this mode.
|
||||
Literal_Pattern
|
||||
|
||||
## Disables anchoring to the region's boundaries.
|
||||
|
||||
By default, the regex engine will allow `^` and `$` to match the
|
||||
boundaries of a restricted region. With this option specified, they will
|
||||
only match the start and end of the input.
|
||||
No_Anchoring_Bounds
|
||||
|
||||
## Enables transparent bounds.
|
||||
|
||||
Setting this option will allow the regex engine to look "through" the
|
||||
boundaries of the engine's region for the purposes of lookahead,
|
||||
lookbehind, and boundary matching.
|
||||
|
||||
Without this flag, the region boundaries are treated as opaque, meaning
|
||||
that the above constructs will fail to match anything outside the region.
|
||||
Transparent_Bounds
|
||||
|
||||
## Specifies that only the unix line ending `'\n'` be considered in the
|
||||
behaviour of the `^` and `$` special characters.
|
||||
Unix_Lines
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Generates a Java bitmask representing the options used to configure the
|
||||
regex.
|
||||
|
||||
Arguments:
|
||||
- opts: The enso-side options to configure the regex.
|
||||
from_enso_options : Vector (Option | Regex_Option) -> Integer
|
||||
from_enso_options opts =
|
||||
java_flags = Panic.recover Any <| opts.flat_map case _ of
|
||||
Option.Literal_Pattern -> [Java_Pattern.LITERAL]
|
||||
Option.Unix_Lines -> [Java_Pattern.UNIX_LINES]
|
||||
Option.No_Anchoring_Bounds -> []
|
||||
Option.Transparent_Bounds -> []
|
||||
Regex_Option.Case_Insensitive -> [Java_Pattern.CASE_INSENSITIVE]
|
||||
Regex_Option.Dot_Matches_Newline -> [Java_Pattern.DOTALL]
|
||||
Regex_Option.Multiline -> [Java_Pattern.MULTILINE]
|
||||
Regex_Option.Comments -> [Java_Pattern.COMMENTS]
|
||||
Regex_Option.Ascii_Matching -> []
|
||||
other -> Panic.throw (Invalid_Option.Error other)
|
||||
|
||||
options_bitmask = java_flags.fold 0 .bit_or
|
||||
|
||||
if opts.contains Regex_Option.Ascii_Matching then options_bitmask else
|
||||
unicode = [Java_Pattern.CANON_EQ, Java_Pattern.UNICODE_CASE, Java_Pattern.UNICODE_CHARACTER_CLASS].fold 0 .bit_or
|
||||
options_bitmask.bit_or unicode
|
@ -1,24 +1,143 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Map.Map
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Range.Extensions
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Regex.No_Such_Group
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Unimplemented.Unimplemented
|
||||
import project.Error.Error
|
||||
import project.Errors.Common.Index_Out_Of_Bounds
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
|
||||
|
||||
## The `Data.Text.Regex.Engine.Match` interface.
|
||||
type Match
|
||||
## PRIVATE
|
||||
internal_regex_result : RegexResult (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (pattern : Pattern) (internal_regex_result : Any) (input : Text)
|
||||
|
||||
## PRIVATE
|
||||
Returns the start UTF16 character index of a group.
|
||||
|
||||
Gets the text matched by the group with the provided identifier, or
|
||||
`Nothing` if the group did not participate in the match. If no such group
|
||||
This method goes directly to the internal match object. It does not
|
||||
take group names, and does not have a default.
|
||||
|
||||
Arguments:
|
||||
- group: the integer group number.
|
||||
internal_start : Integer -> Integer
|
||||
internal_start self group = self.internal_regex_result.getStart group
|
||||
|
||||
## PRIVATE
|
||||
Returns the end UTF16 character index, plus one, of a group.
|
||||
|
||||
This method goes directly to the internal match object. It does not
|
||||
take group names, and does not have a default.
|
||||
|
||||
Arguments:
|
||||
- group: the integer group number.
|
||||
internal_end : Integer -> Integer
|
||||
internal_end self group = self.internal_regex_result.getEnd group
|
||||
|
||||
## Returns the start UTF16 character index of a group.
|
||||
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
utf_16_start : Integer | Text -> Integer
|
||||
utf_16_start self group=0 =
|
||||
span = self.utf_16_span group
|
||||
if span.is_nothing then Nothing else span.start
|
||||
|
||||
## Returns the end UTF16 character index, plus one, of a group.
|
||||
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
utf_16_end : Integer | Text -> Integer
|
||||
utf_16_end self group=0 =
|
||||
span = self.utf_16_span group
|
||||
if span.is_nothing then Nothing else span.end
|
||||
|
||||
## Returns the start grapheme index of a group.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
start : Integer | Text -> Integer
|
||||
start self group=0 =
|
||||
span = self.span group
|
||||
if span.is_nothing then Nothing else span.start
|
||||
|
||||
## Returns the end grapheme index, plus one, of a group.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
end : Integer | Text -> Integer
|
||||
end self group=0 =
|
||||
span = self.span group
|
||||
if span.is_nothing then Nothing else span.end
|
||||
|
||||
## Gets the UTF16 span matched by the group with the provided identifier, or
|
||||
a default value if the group did not participate in the match. If no such
|
||||
group exists for the provided identifier, a `No_Such_Group` is returned.
|
||||
|
||||
Arguments:
|
||||
- group: The integer index or name of that group.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match.utf_16_span will return the default value.
|
||||
utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
|
||||
utf_16_span self group=0 ~default=Nothing =
|
||||
group_id = self.pattern.lookup_group group
|
||||
start = self.internal_start group_id
|
||||
end = self.internal_end group_id
|
||||
does_not_participate = start == -1 || end == -1
|
||||
if does_not_participate then default else
|
||||
range = Range.new start end
|
||||
Utf_16_Span.Value range self.input
|
||||
|
||||
## Gets the grapheme span matched by the group with the provided identifier, or
|
||||
a default value if the group did not participate in the match. If no such group
|
||||
exists for the provided identifier, a `No_Such_Group` is returned.
|
||||
|
||||
Arguments:
|
||||
- id: The index or name of that group.
|
||||
- group: The integer index or name of that group.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
@ -26,19 +145,62 @@ type Match
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
group : Integer | Text -> Text | Nothing ! No_Such_Group
|
||||
group self _ = Unimplemented.throw "This is an interface only."
|
||||
|
||||
## PRIVATE
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
|
||||
Gets a vector containing the results of _all_ of the capturing groups in
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. If the caller tries to get
|
||||
group 3, Match.span will return the default value.
|
||||
span : Integer | Text -> Any -> Span ! No_Such_Group
|
||||
span self group=0 ~default=Nothing =
|
||||
result = self.utf_16_span group Nothing
|
||||
if result.is_nothing then default else result.to_grapheme_span
|
||||
|
||||
## Gets the Text matched by the group with the provided identifier, or
|
||||
a default value if the group did not participate in the match. If no such
|
||||
group exists for the provided identifier, a `No_Such_Group` is returned.
|
||||
|
||||
Arguments:
|
||||
- group: The integer index or name of that group.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. If the caller tries to get
|
||||
group 3, Match.text will return the default value.
|
||||
text : Integer | Text -> Any -> Text ! No_Such_Group
|
||||
text self group=0 ~default=Nothing =
|
||||
result = self.span group Nothing
|
||||
if result.is_nothing then default else result.text
|
||||
|
||||
## Gets a vector containing the Text of _all_ of the capturing groups in
|
||||
the pattern, replacing the value of groups that did not participate in
|
||||
the match with `default`.
|
||||
the match with `default`. This vector includes group 0, which contains
|
||||
the entire match.
|
||||
|
||||
Arguments:
|
||||
- default: The value to return for a given index when the group at that
|
||||
index did not participate in the match. The default for this argument
|
||||
should be `Nothing`.
|
||||
index did not participate in the match.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
@ -46,60 +208,81 @@ type Match
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. `groups` will return the
|
||||
default value for groups that do not participate.
|
||||
|
||||
> Example
|
||||
Get a vector of the text matched by all of the groups in this match,
|
||||
replacing the value for groups that didn't match with "UNMATCHED".
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_groups =
|
||||
match = Examples.match
|
||||
match.groups default="UNMATCHED"
|
||||
groups : Any -> Vector (Text | Any)
|
||||
groups self _ = Unimplemented.throw "This is an interface only."
|
||||
groups self ~default=Nothing =
|
||||
group_numbers = 0.up_to self.pattern.group_count
|
||||
group_numbers.map n-> (self.text n . if_nothing default)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Gets a map containing the named capturing groups for the pattern,
|
||||
## Gets a map containing the named capturing groups for the pattern,
|
||||
replacing the value for groups that did not participate in the match with
|
||||
`default`.
|
||||
|
||||
Arguments:
|
||||
- default: The value to return for a given name when the group at that
|
||||
index did not participate in the match. This should default to
|
||||
`Nothing`.
|
||||
index did not participate in the match.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. `named_groups` will map
|
||||
a named group that does not participate to the default value.
|
||||
|
||||
> Example
|
||||
Get the map of all of the named groups in this match, replacing the
|
||||
value for groups that didn't participate in the match with "UNMATCHED".
|
||||
|
||||
pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input
|
||||
## match.named_groups.keys.sort == ["empty", "letters"]
|
||||
named_groups : Any -> Map Text (Text | Any)
|
||||
named_groups self _ = Unimplemented.throw "This is an interface only."
|
||||
named_groups self default=Nothing =
|
||||
named_group_names = self.pattern.group_names
|
||||
spans = named_group_names.map name-> self.text name default=default
|
||||
Map.from_vector (named_group_names.zip spans)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Gets the index of the first character captured by the group with the
|
||||
given identifier, or `Nothing` if the group did not participate in the
|
||||
match.
|
||||
## Gets the Text matched by the group with the provided index, or
|
||||
a default value if the group did not participate in the match.
|
||||
If the identifier is invalid then `if_missing` is returned.
|
||||
|
||||
Arguments:
|
||||
- id: The identifier for the group to fetch the start index for.
|
||||
start : Integer | Text -> Integer | Nothing ! No_Such_Group
|
||||
start self _ = Unimplemented.throw "This is an interface only."
|
||||
- index: The integer index or name of that group.
|
||||
- if_missing: The value to return if the index is out of bounds.
|
||||
get : Integer -> Any -> Text | Any
|
||||
get self index ~if_missing=Nothing =
|
||||
self.text index . catch No_Such_Group (_-> if_missing)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Gets the index of the first character after `start` that was not captured
|
||||
by the group with the given identifier, or `Nothing` if the group did not
|
||||
participate in the match.
|
||||
## Gets the Text matched by the group with the provided index, or
|
||||
a default value if the group did not participate in the match.
|
||||
If the identifier is invalid then Index_Out_Of_Bounds is thrown.
|
||||
|
||||
Arguments:
|
||||
- id: The identifier for the group to fetch the end index for.
|
||||
end : Integer | Text -> Integer | Nothing ! No_Such_Group
|
||||
end self _ = Unimplemented.throw "This is an intercace only."
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Returns the span matched by the group with the provided identifier, or
|
||||
`Nothing` if the group did not participate in the match.
|
||||
|
||||
Arguments:
|
||||
- id: The identifier for the group to fetch the end index for.
|
||||
span : Integer | Text -> Span | Nothing ! No_Such_Group
|
||||
span self _ = Unimplemented.throw "This is an interface only."
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Returns the start character index of the match's region.
|
||||
start_position : Integer
|
||||
start_position self = Unimplemented.throw "This is an interface only."
|
||||
|
||||
## Returns the end character index of the match's region.
|
||||
end_position : Integer
|
||||
end_position self = Unimplemented.throw "This is an interface only."
|
||||
- index: The integer index or name of that group.
|
||||
- if_missing: The value to return if the index is out of bounds.
|
||||
at : Integer -> Text ! Index_Out_Of_Bounds
|
||||
at self index =
|
||||
self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count))
|
||||
|
@ -1,287 +0,0 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Map.Map
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Range.Extensions
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Regex_2.No_Such_Group
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Error.Error
|
||||
import project.Errors.Common.Index_Out_Of_Bounds
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
|
||||
|
||||
type Match_2
|
||||
## internal_regex_result : RegexResult (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (pattern : Pattern_2) (internal_regex_result : Any) (input : Text)
|
||||
|
||||
## PRIVATE
|
||||
Returns the start UTF16 character index of a group.
|
||||
|
||||
This method goes directly to the internal match object. It does not
|
||||
take group names, and does not have a default.
|
||||
|
||||
Arguments:
|
||||
- group: the integer group number.
|
||||
internal_start : Integer -> Integer
|
||||
internal_start self group = self.internal_regex_result.getStart group
|
||||
|
||||
## PRIVATE
|
||||
Returns the end UTF16 character index, plus one, of a group.
|
||||
|
||||
This method goes directly to the internal match object. It does not
|
||||
take group names, and does not have a default.
|
||||
|
||||
Arguments:
|
||||
- group: the integer group number.
|
||||
internal_end : Integer -> Integer
|
||||
internal_end self group = self.internal_regex_result.getEnd group
|
||||
|
||||
## Returns the start UTF16 character index of a group.
|
||||
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
utf_16_start : Integer | Text -> Integer
|
||||
utf_16_start self group=0 =
|
||||
span = self.utf_16_span group
|
||||
if span.is_nothing then Nothing else span.start
|
||||
|
||||
## Returns the end UTF16 character index, plus one, of a group.
|
||||
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
utf_16_end : Integer | Text -> Integer
|
||||
utf_16_end self group=0 =
|
||||
span = self.utf_16_span group
|
||||
if span.is_nothing then Nothing else span.end
|
||||
|
||||
## Returns the start grapheme index of a group.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
start : Integer | Text -> Integer
|
||||
start self group=0 =
|
||||
span = self.span group
|
||||
if span.is_nothing then Nothing else span.start
|
||||
|
||||
## Returns the end grapheme index, plus one, of a group.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
|
||||
Arguments:
|
||||
- group: the group name or number. Marked groups defined in the regex are
|
||||
numbered starting at 1; group 0 refers to the entire match.
|
||||
end : Integer | Text -> Integer
|
||||
end self group=0 =
|
||||
span = self.span group
|
||||
if span.is_nothing then Nothing else span.end
|
||||
|
||||
## Gets the UTF16 span matched by the group with the provided identifier, or
|
||||
a default value if the group did not participate in the match. If no such
|
||||
group exists for the provided identifier, a `No_Such_Group` is returned.
|
||||
|
||||
Arguments:
|
||||
- group: The integer index or name of that group.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match_2.utf_16_span will return the default value.
|
||||
utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
|
||||
utf_16_span self group=0 ~default=Nothing =
|
||||
group_id = self.pattern.lookup_group group
|
||||
start = self.internal_start group_id
|
||||
end = self.internal_end group_id
|
||||
does_not_participate = start == -1 || end == -1
|
||||
if does_not_participate then default else
|
||||
range = Range.new start end
|
||||
Utf_16_Span.Value range self.input
|
||||
|
||||
## Gets the grapheme span matched by the group with the provided identifier, or
|
||||
a default value if the group did not participate in the match. If no such group
|
||||
exists for the provided identifier, a `No_Such_Group` is returned.
|
||||
|
||||
Arguments:
|
||||
- group: The integer index or name of that group.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
! What is a Character?
|
||||
A character is defined as an Extended Grapheme Cluster, see Unicode
|
||||
Standard Annex 29. This is the smallest unit that still has semantic
|
||||
meaning in most text-processing applications.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get
|
||||
group 3, Match_2.span will return the default value.
|
||||
span : Integer | Text -> Any -> Span ! No_Such_Group
|
||||
span self group=0 ~default=Nothing =
|
||||
result = self.utf_16_span group Nothing
|
||||
if result.is_nothing then default else result.to_grapheme_span
|
||||
|
||||
## Gets the Text matched by the group with the provided identifier, or
|
||||
a default value if the group did not participate in the match. If no such
|
||||
group exists for the provided identifier, a `No_Such_Group` is returned.
|
||||
|
||||
Arguments:
|
||||
- group: The integer index or name of that group.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get
|
||||
group 3, Match_2.text will return the default value.
|
||||
text : Integer | Text -> Any -> Text ! No_Such_Group
|
||||
text self group=0 ~default=Nothing =
|
||||
result = self.span group Nothing
|
||||
if result.is_nothing then default else result.text
|
||||
|
||||
## Gets a vector containing the Text of _all_ of the capturing groups in
|
||||
the pattern, replacing the value of groups that did not participate in
|
||||
the match with `default`. This vector includes group 0, which contains
|
||||
the entire match.
|
||||
|
||||
Arguments:
|
||||
- default: The value to return for a given index when the group at that
|
||||
index did not participate in the match.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. `groups` will return the
|
||||
default value for groups that do not participate.
|
||||
|
||||
> Example
|
||||
Get a vector of the text matched by all of the groups in this match,
|
||||
replacing the value for groups that didn't match with "UNMATCHED".
|
||||
|
||||
import Standard.Examples
|
||||
|
||||
example_groups =
|
||||
match = Examples.match
|
||||
match.groups default="UNMATCHED"
|
||||
groups : Any -> Vector (Text | Any)
|
||||
groups self ~default=Nothing =
|
||||
group_numbers = 0.up_to self.pattern.group_count
|
||||
group_numbers.map n-> (self.text n . if_nothing default)
|
||||
|
||||
## Gets a map containing the named capturing groups for the pattern,
|
||||
replacing the value for groups that did not participate in the match with
|
||||
`default`.
|
||||
|
||||
Arguments:
|
||||
- default: The value to return for a given name when the group at that
|
||||
index did not participate in the match.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. `named_groups` will map
|
||||
a named group that does not participate to the default value.
|
||||
|
||||
> Example
|
||||
Get the map of all of the named groups in this match, replacing the
|
||||
value for groups that didn't participate in the match with "UNMATCHED".
|
||||
|
||||
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input
|
||||
## match.named_groups.keys.sort == ["empty", "letters"]
|
||||
named_groups : Any -> Map Text (Text | Any)
|
||||
named_groups self default=Nothing =
|
||||
named_group_names = self.pattern.group_names
|
||||
spans = named_group_names.map name-> self.text name default=default
|
||||
Map.from_vector (named_group_names.zip spans)
|
||||
|
||||
## Gets the Text matched by the group with the provided index, or
|
||||
a default value if the group did not participate in the match.
|
||||
If the identifier is invalid then `if_missing` is returned.
|
||||
|
||||
Arguments:
|
||||
- index: The integer index or name of that group.
|
||||
- if_missing: The value to return if the index is out of bounds.
|
||||
get : Integer -> Any -> Text | Any
|
||||
get self index ~if_missing=Nothing =
|
||||
self.text index . catch No_Such_Group (_-> if_missing)
|
||||
|
||||
## Gets the Text matched by the group with the provided index, or
|
||||
a default value if the group did not participate in the match.
|
||||
If the identifier is invalid then Index_Out_Of_Bounds is thrown.
|
||||
|
||||
Arguments:
|
||||
- index: The integer index or name of that group.
|
||||
- if_missing: The value to return if the index is out of bounds.
|
||||
at : Integer -> Text ! Index_Out_Of_Bounds
|
||||
at self index =
|
||||
self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count))
|
@ -1,78 +1,183 @@
|
||||
import project.Data.Boolean.Boolean
|
||||
import project.Any.Any
|
||||
import project.Data.Filter_Condition.Filter_Condition
|
||||
import project.Data.Map.Map
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
import project.Data.Range.Extensions
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Helpers
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
|
||||
import project.Data.Text.Regex.No_Such_Group
|
||||
import project.Data.Text.Regex.Replacer.Replacer
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Unimplemented.Unimplemented
|
||||
import project.Errors.Common.Type_Error
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Meta
|
||||
import project.Nothing.Nothing
|
||||
import project.Polyglot.Polyglot
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Data.Index_Sub_Range import sort_and_merge_ranges
|
||||
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
|
||||
## The `Data.Text.Regex.Engine.Pattern` interface.
|
||||
type Pattern
|
||||
## internal_regex_object : RegexObject (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (internal_regex_object : Any)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
- mode: The matching mode to use. This must default to `Regex_Mode.All`.
|
||||
|
||||
This method will _always_ return `Nothing` if it fails to match.
|
||||
|
||||
? Return Type
|
||||
When asked to match in a mode that can only provide a single match, the
|
||||
return type is either a single `Match` object. When asked to match in a
|
||||
mode that permits multiple matches, it will always return a `Vector`,
|
||||
even if only a single match is found.
|
||||
match : Text -> (Regex_Mode | Matching_Mode) -> Match | Vector Match | Nothing
|
||||
match self _ _ = Unimplemented.throw "This is an interface only."
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Returns `True` if the input matches against the pattern described by
|
||||
## Returns `True` if the input matches against the pattern described by
|
||||
`self`, otherwise `False`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to check for matching.
|
||||
matches : Text -> Boolean
|
||||
matches self _ = Unimplemented.throw "This is an interface only."
|
||||
matches : Text -> Boolean | Type_Error
|
||||
matches self input =
|
||||
Helpers.expect_text input <|
|
||||
m = self.internal_regex_object.exec input 0
|
||||
m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length
|
||||
|
||||
## PRIVATE
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Tries to find the text in the `input` that matches against the pattern
|
||||
`self`.
|
||||
Returns a `Match` containing the matched text and its match groups, or
|
||||
`Nothing` if the match failed.
|
||||
|
||||
Arguments:
|
||||
- input: The text to find matches in.
|
||||
- mode: The matching mode to use. This must default to `Regex_Mode.All`
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match : Text -> Match | Nothing | Type_Error
|
||||
match self input =
|
||||
Helpers.expect_text input <|
|
||||
it = Match_Iterator.new self input
|
||||
case it.next of
|
||||
Match_Iterator_Value.Next _ match _ -> match
|
||||
Match_Iterator_Value.Last _ -> Nothing
|
||||
|
||||
This method will _always_ return `Nothing` if it fails to find any
|
||||
matches.
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
? Return Type
|
||||
When asked to match in a mode that can only provide a single match, the
|
||||
return type is either a single `Match` object. When asked to match in a
|
||||
mode that permits multiple matches, it will always return a `Vector`,
|
||||
even if only a single match is found.
|
||||
find : Text -> (Regex_Mode | Matching_Mode) -> Text | Vector Text | Nothing
|
||||
find self _ _ = Unimplemented.throw "This is an interface only."
|
||||
Returns a `Vector Match` object, each containing the matched text
|
||||
and its match groups.
|
||||
|
||||
## PRIVATE
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match_all : Text -> Vector Match ! Type_Error
|
||||
match_all self input =
|
||||
Helpers.expect_text input <|
|
||||
pattern_is_empty = self.internal_regex_object.pattern == ''
|
||||
if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
|
||||
builder = Vector.new_builder
|
||||
it = Match_Iterator.new self input
|
||||
go it = case it.next of
|
||||
Match_Iterator_Value.Next _ match next_it ->
|
||||
builder.append match
|
||||
@Tail_Call go next_it
|
||||
Match_Iterator_Value.Last _ -> Nothing
|
||||
go it
|
||||
builder.to_vector
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Text` containing the matched text, or `Nothing` if the match
|
||||
failed.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
find : Text -> Text | Nothing | Type_Error
|
||||
find self input =
|
||||
Helpers.expect_text input <|
|
||||
match_to_group_maybe <| self.match input
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Vector Text`, each containing the matched text.
|
||||
If the pattern does not match, an empty `Vector` is returned.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
find_all : Text -> Vector Text | Type_Error
|
||||
find_all self input =
|
||||
Helpers.expect_text input <|
|
||||
self.match_all input . map match_to_group_maybe
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Splits the `input` text based on the pattern described by `self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to split based on the pattern described by `self`.
|
||||
- mode: The splitting mode to use. This must default to `Regex_Mode.All`.
|
||||
- only_first: If true, only split at the first occurrence.
|
||||
|
||||
This method will _always_ return a vector. If no splits take place, the
|
||||
vector will contain a single element.
|
||||
split : Text -> (Matching_Mode | Integer | Regex_Mode) -> Vector Text
|
||||
split self _ _ = Unimplemented.throw "This is an interface only."
|
||||
vector will contain a single element (equal to the original string).
|
||||
|
||||
## PRIVATE
|
||||
> Example
|
||||
Split on the first instance of the pattern.
|
||||
pattern = Regex.compile "cd"
|
||||
input = "abcdefcdghij"
|
||||
texts = pattern.split input only_first=True
|
||||
texts . should_equal ["ab", "efcdghij"]
|
||||
|
||||
> Example
|
||||
Split on all instances of the pattern in the input.
|
||||
pattern = Regex.compile "a"
|
||||
input = "bacadaeaf"
|
||||
texts = pattern.split input
|
||||
texts . should_equal ["b", "c", "d", "e", "f"]
|
||||
|
||||
> Example
|
||||
Returns the original text if there are no matches.
|
||||
pattern = Regex.compile "aa"
|
||||
input = "abcdefghij"
|
||||
texts = pattern.split input
|
||||
texts . should_equal ["abcdefghij"]
|
||||
split : Text -> Boolean -> Vector Text | Type_Error
|
||||
split self input only_first=False =
|
||||
Helpers.expect_text input <|
|
||||
builder = Vector.new_builder
|
||||
it = Match_Iterator.new self input
|
||||
go next = case next of
|
||||
Match_Iterator_Value.Next filler _ next_it ->
|
||||
builder.append filler.text
|
||||
next = if only_first then next_it.early_exit else next_it.next
|
||||
@Tail_Call go next
|
||||
Match_Iterator_Value.Last filler ->
|
||||
builder.append filler.text
|
||||
go it.next
|
||||
builder.to_vector
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Takes an input string and returns all the matches as a `Vector Text`.
|
||||
If the pattern contains marked groups, the values are concatenated
|
||||
together; otherwise the whole match is returned. Non-participating
|
||||
groups are omitted.
|
||||
|
||||
Arguments:
|
||||
- input: The text to tokenize.
|
||||
|
||||
> Example
|
||||
Split to blocks of 3 characters.
|
||||
|
||||
Regex.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']
|
||||
|
||||
> Example
|
||||
Split to blocks of 3 characters taking first and third letters.
|
||||
|
||||
Regex.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']
|
||||
|
||||
> Example
|
||||
Split a text on any white space.
|
||||
|
||||
Regex.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
|
||||
== ['Hello','Big','Wide','World','Goodbye!']
|
||||
tokenize : Text -> Vector Text
|
||||
tokenize self input =
|
||||
self.match_all input . map (build_tokenization_output_from_match self _)
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Replace all occurrences of the pattern described by `self` in the `input`
|
||||
with the specified `replacement`.
|
||||
@ -80,10 +185,250 @@ type Pattern
|
||||
Arguments:
|
||||
- input: The text in which to perform the replacement(s).
|
||||
- replacement: The text with which to replace any matches. It may contain
  references to groups matched by the regex (see below).
|
||||
- mode: The matching mode to use for finding candidates to replace. This
|
||||
must default to `Regex_Mode.All`.
|
||||
- only_first: If True, only replace the first match.
|
||||
|
||||
If this method performs no replacements it will return the `input` text
|
||||
unchanged.
|
||||
replace : Text -> Text -> Regex_Mode | Matching_Mode | Integer -> Text
|
||||
replace self _ _ _ = Unimplemented.throw "This is an interface only."
|
||||
|
||||
The replacement string can contain references to groups matched by the
|
||||
regex. The following syntaxes are supported:
|
||||
$0: the entire match string
|
||||
$&: the entire match string
|
||||
$n: the nth group
|
||||
$<foo>: Named group `foo`
|
||||
|
||||
> Example
|
||||
Replace letters in the text "aa".
|
||||
|
||||
pattern = Regex.compile 'aa'
|
||||
pattern.replace 'aaa' 'b' == 'ba'
|
||||
|
||||
> Example
|
||||
Replace all occurrences of letters 'l' and 'o' with '#'.
|
||||
|
||||
pattern = Regex.compile '[lo]'
|
||||
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
|
||||
|
||||
> Example
|
||||
Replace the first occurrence of letter 'l' with '#'.
|
||||
|
||||
pattern = Regex.compile 'l'
|
||||
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
|
||||
|
||||
> Example
|
||||
Replace texts in quotes with parentheses.
|
||||
|
||||
pattern = Regex.compile '"(.*?)"'
|
||||
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
|
||||
|
||||
> Example
|
||||
Replace a literal string with a replacement value.
|
||||
|
||||
pattern = Regex.compile "aa"
|
||||
input = "aa ab aa ac ad aa aa ax"
|
||||
match = pattern.replace input "xyz"
|
||||
match == "xyz ab xyz ac ad xyz xyz ax"
|
||||
|
||||
> Example
|
||||
Replace each word with the same word surrounded by `[]`.
|
||||
|
||||
pattern = Regex.compile "([a-z]+)"
|
||||
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
|
||||
replace : Text -> Text -> Boolean -> Text | Type_Error
replace self input replacement only_first=False =
    Helpers.expect_text input <|
        # The replacer is always built before matching starts, so a problem
        # with the replacement string (e.g. a reference to a group that does
        # not exist) is reported even when the pattern never matches.
        # (A previous `case` on the freshly created iterator could never take
        # its `Match_Iterator_Value.Last` branch, so it has been removed.)
        replacer = Replacer.new replacement self

        replacer.if_not_error <|
            it = Match_Iterator.new self input
            # `current` accumulates the output: the unmatched filler before
            # each match followed by that match's replacement.
            go next current = case next of
                Match_Iterator_Value.Next filler match next_it ->
                    new_value = current + filler.text + (replacer.replace match)
                    # With `only_first`, the rest of the input is copied verbatim.
                    next = if only_first then next_it.early_exit else next_it.next
                    @Tail_Call go next new_value
                Match_Iterator_Value.Last filler ->
                    current + filler.text
            go it.next ""
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Look up a match group name or number, and check that it is valid.
|
||||
|
||||
Arguments:
|
||||
- id: The name or number of the group that was asked for.
|
||||
|
||||
Returns: a group number.
|
||||
|
||||
A group number is invalid if it is outside the range of groups
|
||||
that were in the original pattern.
|
||||
|
||||
A group name is invalid if it was not defined in the original pattern.
|
||||
|
||||
A group name is an alias for a group number; if a name is passed to
|
||||
this method, it returns the corresponding group number.
|
||||
|
||||
If a group number is passed to `lookup_group` and it is valid, it will
|
||||
simply return the group number.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match.group will return Nothing.
|
||||
|
||||
lookup_group : Integer | Text -> Integer ! No_Such_Group
lookup_group self id = case id of
    index : Integer ->
        # Valid numbers are 0 (the whole match) up to, but excluding, the
        # group count.
        in_range = index >= 0 && index < self.internal_regex_object.groupCount
        if in_range then index else Error.throw (No_Such_Group.Error index)
    name : Text ->
        # Maps name to number
        named_groups = self.internal_regex_object.groups
        case named_groups of
            # If Nothing, there are no named groups
            Nothing -> Error.throw (No_Such_Group.Error name)
            _ -> case read_group_map named_groups name of
                number : Integer -> number
                Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Return a lazy iterator over matches against a string.
|
||||
|
||||
Arguments
|
||||
- input: the string to match against.
|
||||
iterator : Text -> Match_Iterator
|
||||
iterator self input = Match_Iterator.new self input
|
||||
|
||||
## Return the number of groups in the underlying RegexObject.
|
||||
Note, the count includes group 0 (the whole match) as well.
|
||||
group_count : Integer
|
||||
group_count self = self.internal_regex_object.groupCount
|
||||
|
||||
## Return a vector of all named group names.
|
||||
group_names : Vector Text
group_names self =
    groups = self.internal_regex_object.groups
    # `lookup_group` treats a `Nothing` value here as "no named groups";
    # do the same and report an empty list of names.
    if groups.is_nothing then [] else
        (polyglot_map_to_map groups).keys
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Performs the regex match, and iterates through the results. Yields both
|
||||
the matched parts of the string, and the 'filler' parts between them.
|
||||
|
||||
The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because
|
||||
matches and replacement boundaries can fall in the middle of multi-
|
||||
character graphemes, thereby splitting them apart.
|
||||
|
||||
At each step, it yields a Match_Iterator_Value, which has either a filler
|
||||
and a match, or just the final filler. A Match_Iterator_Value.Last value is
|
||||
returned at the end, and only at the end.
|
||||
|
||||
Optionally, you can call `early_exit` to have it return the remainder of
|
||||
the string, unmatched, as a single Last value. (Used for `replace` with
|
||||
`only_first=True`.)
|
||||
type Match_Iterator
    ## Create an iterator positioned at the start of `input`.
    new : Pattern -> Text -> Match_Iterator
    new pattern input = Match_Iterator.Value pattern input 0

    ## Arguments:
       - pattern: the pattern to match with.
       - input: the text being searched.
       - cursor: the position in `input` at which the next search starts
         (0 for a fresh iterator, then the end of the previous match).
    Value (pattern : Pattern) (input : Text) (cursor : Integer)

    ## Return the next match, or the last filler string if there is no
       additional match.

       Also returns the next iterator, if there was a match.
    next : Match_Iterator_Value
    next self =
        regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
        case regex_result.isMatch of
            # No further match: everything from the cursor onwards is filler.
            False -> self.early_exit
            True ->
                # The filler is the unmatched text between the cursor and
                # the start of this match.
                match_start = regex_result.getStart 0
                filler_range = Range.new self.cursor match_start
                filler_span = Utf_16_Span.Value filler_range self.input
                match = Match.Value self.pattern regex_result self.input
                # The next search resumes where this match ended.
                # NOTE(review): a zero-length match leaves the new cursor at
                # the position where that match was found, so the following
                # `next` would presumably find the same match again --
                # confirm that callers exclude such patterns.
                next_cursor = match.utf_16_end 0
                next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
                Match_Iterator_Value.Next filler_span match next_iterator

    ## Returns the remainder of the string, unmatched.
    early_exit : Match_Iterator_Value
    early_exit self =
        filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
        filler_span = Utf_16_Span.Value filler_range self.input
        Match_Iterator_Value.Last filler_span

    ## Runs the iterator to the end and returns a vector with each filler
       wrapped in double quotes and each match wrapped in slashes, in order.
    to_text_debug : Vector Text
    to_text_debug self =
        vb = Vector.new_builder
        go it = case it.next of
            Match_Iterator_Value.Next filler match next_it ->
                vb.append ('\"' + filler.text + '\"')
                vb.append ("/" + (match.span 0).text + "/")
                # Tail call, as in the other match loops, so that the
                # recursion depth does not grow with the number of matches.
                @Tail_Call go next_it
            Match_Iterator_Value.Last filler ->
                vb.append ('\"' + filler.text + '\"')
        go self
        vb.to_vector
|
||||
|
||||
## PRIVATE
|
||||
type Match_Iterator_Value
    ## A match, together with the unmatched text that precedes it and the
       iterator to continue from.

       The filler is a `Utf_16_Span`: `Match_Iterator` always constructs it
       with `Utf_16_Span.Value`.
    Next (filler : Utf_16_Span) (match : Match) (next_iterator : Match_Iterator)

    ## The final, unmatched remainder of the input.
    Last (filler : Utf_16_Span)
|
||||
|
||||
## PRIVATE
|
||||
Convert the polyglot map to a Map.
|
||||
polyglot_map_to_map : Any -> Map Any Any
polyglot_map_to_map map =
    # Read the member names of the polyglot object, then pair each name with
    # the member it refers to.
    member_names = Vector.from_polyglot_array (Polyglot.get_members map)
    entries = member_names.map name->
        [name, Polyglot.get_member map name]
    Map.from_vector entries
|
||||
|
||||
## PRIVATE
|
||||
Get the named group from the polyglot map.
|
||||
read_group_map : Any -> Text -> Integer | Nothing
|
||||
read_group_map polyglot_map name =
|
||||
map = polyglot_map_to_map polyglot_map
|
||||
map.get name
|
||||
|
||||
## PRIVATE
|
||||
match_to_group_maybe : Match | Nothing -> Text | Nothing
|
||||
match_to_group_maybe match =
|
||||
if match.is_nothing then Nothing else match.text 0
|
||||
|
||||
## PRIVATE
|
||||
Build an output string from a Match resulting from `tokenize`.
|
||||
See `tokenize`.
|
||||
build_tokenization_output_from_match : Pattern -> Match -> Text
|
||||
build_tokenization_output_from_match pattern match =
|
||||
if pattern.group_count == 1 then match.text 0 else
|
||||
# Extract the ranges of the spans of all capturing groups
|
||||
group_numbers = 1.up_to pattern.group_count
|
||||
spans = group_numbers.map (n-> match.span n) . filter Filter_Condition.Not_Nothing
|
||||
ranges = spans.map span-> case span of Span.Value range _ -> range
|
||||
|
||||
# Eliminate nested capturing groups by sorting and merging the ranges.
|
||||
top_level_ranges = sort_and_merge_ranges ranges
|
||||
|
||||
# Reconstruct `Spans` from the synthesized `Ranges`, and concatenate.
|
||||
text_all = case spans.at 0 of Span.Value _ text -> text
|
||||
top_level_spans = top_level_ranges.map range-> Span.Value range text_all
|
||||
top_level_spans.map (.text) . join
|
||||
|
@ -1,425 +0,0 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Filter_Condition.Filter_Condition
|
||||
import project.Data.Map.Map
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Range.Extensions
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Regex.Match_2.Match_2
|
||||
import project.Data.Text.Regex.Replacer.Replacer
|
||||
import project.Data.Text.Regex_2.No_Such_Group
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Meta
|
||||
import project.Nothing.Nothing
|
||||
import project.Polyglot.Polyglot
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Data.Index_Sub_Range import sort_and_merge_ranges
|
||||
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
|
||||
type Pattern_2
|
||||
## internal_regex_object : RegexObject (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (internal_regex_object : Any)
|
||||
|
||||
## Returns `True` if the input matches against the pattern described by
|
||||
`self`, otherwise `False`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to check for matching.
|
||||
matches : Text -> Boolean
matches self input =
    m = self.internal_regex_object.exec input 0
    # The input matches only if the match spans it from start to end.
    # The end position is compared against `Text_Utils.char_length`, the
    # same measure `Match_Iterator` uses for the end of the input.
    # `input.length` is presumably a grapheme count, which would differ for
    # texts containing multi-unit graphemes -- TODO confirm.
    m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == (Text_Utils.char_length input)
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Match_2` containing the matched text and its match groups, or
|
||||
`Nothing` if the match failed.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match : Text -> Match_2 | Nothing
|
||||
match self input =
|
||||
it = Match_Iterator.new self input
|
||||
case it.next of
|
||||
Match_Iterator_Value.Next _ match _ -> match
|
||||
Match_Iterator_Value.Last _ -> Nothing
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Vector Match_2` object, each containing the matched text
|
||||
and its match groups.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match_all : Text -> Vector Match_2 ! Illegal_Argument
|
||||
match_all self input =
|
||||
pattern_is_empty = self.internal_regex_object.pattern == ''
|
||||
if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
|
||||
builder = Vector.new_builder
|
||||
it = Match_Iterator.new self input
|
||||
go it = case it.next of
|
||||
Match_Iterator_Value.Next _ match next_it ->
|
||||
builder.append match
|
||||
@Tail_Call go next_it
|
||||
Match_Iterator_Value.Last _ -> Nothing
|
||||
go it
|
||||
builder.to_vector
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Text` containing the matched text, or `Nothing` if the match
|
||||
failed.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
find : Text -> Text | Nothing
|
||||
find self input =
|
||||
match_to_group_maybe <| self.match input
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Vector Text`, each containing the matched text.
|
||||
If the pattern does not match, an empty `Vector` is returned.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
find_all : Text -> Vector Text
|
||||
find_all self input =
|
||||
self.match_all input . map match_to_group_maybe
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Splits the `input` text based on the pattern described by `self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to split based on the pattern described by `self`.
|
||||
- only_first: If True, only split at the first occurrence.
|
||||
|
||||
This method will _always_ return a vector. If no splits take place, the
|
||||
vector will contain a single element (equal to the original string).
|
||||
|
||||
> Example
|
||||
Split on the first instance of the pattern.
|
||||
pattern = Regex_2.compile "cd"
|
||||
input = "abcdefcdghij"
|
||||
texts = pattern.split input only_first=True
|
||||
texts . should_equal ["ab", "efcdghij"]
|
||||
|
||||
> Example
|
||||
Split on all instances of the pattern in the input.
|
||||
pattern = Regex_2.compile "a"
|
||||
input = "bacadaeaf"
|
||||
texts = pattern.split input
|
||||
texts . should_equal ["b", "c", "d", "e", "f"]
|
||||
|
||||
> Example
|
||||
Returns the original text if there are no matches.
|
||||
pattern = Regex_2.compile "aa"
|
||||
input = "abcdefghij"
|
||||
texts = pattern.split input
|
||||
texts . should_equal ["abcdefghij"]
|
||||
split : Text -> Boolean -> Vector Text
|
||||
split self input only_first=False =
|
||||
builder = Vector.new_builder
|
||||
it = Match_Iterator.new self input
|
||||
go next = case next of
|
||||
Match_Iterator_Value.Next filler _ next_it ->
|
||||
builder.append filler.text
|
||||
next = if only_first then next_it.early_exit else next_it.next
|
||||
@Tail_Call go next
|
||||
Match_Iterator_Value.Last filler ->
|
||||
builder.append filler.text
|
||||
go it.next
|
||||
builder.to_vector
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Takes an input string and returns all the matches as a `Vector Text`.
|
||||
If the pattern contains marked groups, the values are concatenated
|
||||
together; otherwise the whole match is returned. Non-participating
|
||||
groups are omitted.
|
||||
|
||||
Arguments:
|
||||
- input: The text to tokenize.
|
||||
|
||||
> Example
|
||||
Split to blocks of 3 characters.
|
||||
|
||||
Regex_2.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']
|
||||
|
||||
> Example
|
||||
Split to blocks of 3 characters taking first and third letters.
|
||||
|
||||
Regex_2.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']
|
||||
|
||||
> Example
|
||||
Split a text on any white space.
|
||||
|
||||
Regex_2.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
|
||||
== ['Hello','Big','Wide','World','Goodbye!']
|
||||
tokenize : Text -> Vector Text
|
||||
tokenize self input =
|
||||
self.match_all input . map (build_tokenization_output_from_match self _)
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Replace all occurrences of the pattern described by `self` in the `input`
|
||||
with the specified `replacement`.
|
||||
|
||||
Arguments:
|
||||
- input: The text in which to perform the replacement(s).
|
||||
- replacement: The literal text with which to replace any matches.
|
||||
- only_first: If True, only replace the first match.
|
||||
|
||||
If this method performs no replacements it will return the `input` text
|
||||
unchanged.
|
||||
|
||||
The replacement string can contain references to groups matched by the
|
||||
regex. The following syntaxes are supported:
|
||||
$0: the entire match string
|
||||
$&: the entire match string
|
||||
$n: the nth group
|
||||
$<foo>: Named group `foo`
|
||||
|
||||
> Example
|
||||
Replace letters in the text "aa".
|
||||
|
||||
pattern = Regex_2.compile 'aa'
|
||||
pattern.replace 'aaa' 'b' == 'ba'
|
||||
|
||||
> Example
|
||||
Replace all occurrences of letters 'l' and 'o' with '#'.
|
||||
|
||||
pattern = Regex_2.compile '[lo]'
|
||||
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
|
||||
|
||||
> Example
|
||||
Replace the first occurrence of letter 'l' with '#'.
|
||||
|
||||
pattern = Regex_2.compile 'l'
|
||||
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
|
||||
|
||||
> Example
|
||||
Replace texts in quotes with parentheses.
|
||||
|
||||
pattern = Regex_2.compile '"(.*?)"'
|
||||
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
|
||||
|
||||
> Example
|
||||
Replace a literal string with a replacement value.
|
||||
|
||||
pattern = Regex_2.compile "aa"
|
||||
input = "aa ab aa ac ad aa aa ax"
|
||||
match = pattern.replace input "xyz"
|
||||
match == "xyz ab xyz ac ad xyz xyz ax"
|
||||
|
||||
> Example
|
||||
Replace each word with the same word surrounded by `[]`.
|
||||
|
||||
pattern = Regex_2.compile "([a-z]+)"
|
||||
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
|
||||
replace : Text -> Text -> Boolean -> Text
|
||||
replace self input replacement only_first=False =
|
||||
it = Match_Iterator.new self input
|
||||
case it of
|
||||
Match_Iterator_Value.Last filler -> filler.text
|
||||
_ ->
|
||||
replacer = Replacer.new replacement self
|
||||
|
||||
replacer.if_not_error <|
|
||||
go next current = case next of
|
||||
Match_Iterator_Value.Next filler match next_it ->
|
||||
new_value = current + filler.text + (replacer.replace match)
|
||||
next = if only_first then next_it.early_exit else next_it.next
|
||||
@Tail_Call go next new_value
|
||||
Match_Iterator_Value.Last filler ->
|
||||
current + filler.text
|
||||
go it.next ""
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Look up a match group name or number, and check that it is valid.
|
||||
|
||||
Arguments:
|
||||
- id: The name or number of the group that was asked for.
|
||||
|
||||
Returns: a group number.
|
||||
|
||||
A group number is invalid if it is outside the range of groups
|
||||
that were in the original pattern.
|
||||
|
||||
A group name is invalid if it was not defined in the original pattern.
|
||||
|
||||
A group name is an alias for a group number; if a name is passed to
|
||||
this method, it returns the corresponding group number.
|
||||
|
||||
If a group number is passed to `lookup_group` and it is valid, it will
|
||||
simply return the group number.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match_2.group will return Nothing.
|
||||
|
||||
lookup_group : Integer | Text -> Integer ! No_Such_Group
|
||||
lookup_group self id =
|
||||
case id of
|
||||
n : Integer -> case (n >= 0 && n < self.internal_regex_object.groupCount) of
|
||||
True -> n
|
||||
False -> Error.throw (No_Such_Group.Error n)
|
||||
name : Text ->
|
||||
# Maps name to number
|
||||
groups = self.internal_regex_object.groups
|
||||
|
||||
n = case groups of
|
||||
# If Nothing, there are no named groups
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
_ ->
|
||||
qq = (read_group_map groups name)
|
||||
case qq of
|
||||
Nothing -> Nothing
|
||||
n : Integer -> n
|
||||
case n of
|
||||
_ : Integer -> n
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Return a lazy iterator over matches against a string.
|
||||
|
||||
Arguments
|
||||
- input: the string to match against.
|
||||
iterator : Text -> Match_Iterator
|
||||
iterator self input = Match_Iterator.new self input
|
||||
|
||||
## Return the number of groups in the underlying RegexObject.
|
||||
Note, the count includes group 0 (the whole match) as well.
|
||||
group_count : Integer
|
||||
group_count self = self.internal_regex_object.groupCount
|
||||
|
||||
## Return a vector of all named group names.
|
||||
group_names : Map Text Integer
|
||||
group_names self =
|
||||
map = polyglot_map_to_map self.internal_regex_object.groups
|
||||
map.keys
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Performs the regex match, and iterates through the results. Yields both
|
||||
the matched parts of the string, and the 'filler' parts between them.
|
||||
|
||||
The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because
|
||||
matches and replacement boundaries can fall in the middle of multi-
|
||||
character graphemes, thereby splitting them apart.
|
||||
|
||||
At each step, it yields a Match_Iterator_Value, which has either a filler
|
||||
and a match, or just the final filler. A Match_Iterator_Value.Last value is
|
||||
returned at the end, and only at the end.
|
||||
|
||||
Optionally, you can call `early_exit` to have it return the remainder of
|
||||
the string, unmatched, as a single Last value. (Used for `replace` with
|
||||
`only_first=True`.)
|
||||
type Match_Iterator
|
||||
new : Pattern_2 -> Text -> Match_Iterator
|
||||
new pattern input = Match_Iterator.Value pattern input 0
|
||||
|
||||
Value (pattern : Pattern_2) (input : Text) (cursor : Integer)
|
||||
|
||||
## Return the next match, or the last filler string if there is no
|
||||
additional match.
|
||||
|
||||
Also returns the next iterator, if there was a match.
|
||||
next : Match_Iterator_Value
|
||||
next self =
|
||||
regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
|
||||
case regex_result.isMatch of
|
||||
False ->
|
||||
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input)
|
||||
Match_Iterator_Value.Last filler_span
|
||||
True ->
|
||||
match_start = regex_result.getStart 0
|
||||
filler_range = Range.new self.cursor match_start
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input)
|
||||
match = Match_2.Value self.pattern regex_result self.input
|
||||
next_cursor = match.utf_16_end 0
|
||||
next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
|
||||
Match_Iterator_Value.Next filler_span match next_iterator
|
||||
|
||||
## Returns the remainder of the string, unmatched.
|
||||
early_exit : Match_Iterator_Value
|
||||
early_exit self =
|
||||
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
|
||||
filler_span = Utf_16_Span.Value filler_range self.input
|
||||
Match_Iterator_Value.Last filler_span
|
||||
|
||||
to_text_debug : Vector Text
|
||||
to_text_debug self =
|
||||
vb = Vector.new_builder
|
||||
go it = case it.next of
|
||||
Match_Iterator_Value.Next filler match next_it ->
|
||||
vb.append ('\"' + filler.text + '\"')
|
||||
vb.append ("/" + (match.span 0).text + "/")
|
||||
go next_it
|
||||
Match_Iterator_Value.Last filler ->
|
||||
vb.append ('\"' + filler.text + '\"')
|
||||
go self
|
||||
vb.to_vector
|
||||
|
||||
## PRIVATE
|
||||
type Match_Iterator_Value
|
||||
Next (filler : Span) (match : Match_2) (next_iterator : Match_Iterator)
|
||||
Last (filler : Span)
|
||||
|
||||
## PRIVATE
|
||||
Convert the polyglot map to a Map.
|
||||
polyglot_map_to_map : Any -> Map Any Any
|
||||
polyglot_map_to_map map =
|
||||
polyglot_keys = Polyglot.get_members map
|
||||
keys = Vector.from_polyglot_array polyglot_keys
|
||||
pairs = keys.map key-> [key, Polyglot.get_member map key]
|
||||
Map.from_vector pairs
|
||||
|
||||
## PRIVATE
|
||||
Get the named group from the polyglot map.
|
||||
read_group_map : Any -> Text -> Integer | Nothing
|
||||
read_group_map polyglot_map name =
|
||||
map = polyglot_map_to_map polyglot_map
|
||||
map.get name
|
||||
|
||||
## PRIVATE
|
||||
match_to_group_maybe : Match_2 | Nothing -> Text | Nothing
|
||||
match_to_group_maybe match =
|
||||
if match.is_nothing then Nothing else match.text 0
|
||||
|
||||
## PRIVATE
|
||||
Build an output string from a Match_2 resulting from `tokenize`.
|
||||
See `tokenize`.
|
||||
build_tokenization_output_from_match : Pattern_2 -> Match_2 -> Text
|
||||
build_tokenization_output_from_match pattern match =
|
||||
if pattern.group_count == 1 then match.text 0 else
|
||||
# Extract the ranges of the spans of all capturing groups
|
||||
group_numbers = 1.up_to pattern.group_count
|
||||
spans = group_numbers.map (n-> match.span n) . filter Filter_Condition.Not_Nothing
|
||||
ranges = spans.map span-> case span of Span.Value range _ -> range
|
||||
|
||||
# Eliminate nested capturing groups by sorting and merging the ranges.
|
||||
top_level_ranges = sort_and_merge_ranges ranges
|
||||
|
||||
# Reconstruct `Spans` from the synthesized `Ranges`, and concatenate.
|
||||
text_all = case spans.at 0 of Span.Value _ text -> text
|
||||
top_level_spans = top_level_ranges.map range-> Span.Value range text_all
|
||||
top_level_spans.map (.text) . join
|
@ -1,28 +0,0 @@
|
||||
## A description of how the regex engine will match on the content.
|
||||
|
||||
This lets you configure how you want to match, from the `First` match only,
|
||||
to matching on the `Full` content of the input text.
|
||||
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
|
||||
type Regex_Mode
|
||||
## The regex will make all possible matches.
|
||||
All
|
||||
|
||||
## The regex will only match if the _entire_ text matches.
|
||||
Full
|
||||
|
||||
## The regex will only match within the region defined by start..end.
|
||||
|
||||
Arguments:
|
||||
- start: The inclusive start bound of the region.
|
||||
- end: The exclusive end bound of the region.
|
||||
- mode: The mode to use within the bounded region.
|
||||
|
||||
! Units
|
||||
The `start` and `end` indices range over _characters_ in the text. The
|
||||
precise definition of `character` is, for the moment, defined by the
|
||||
regular expression engine itself.
|
||||
Bounded (start : Integer) (end : Integer) (mode : (Matching_Mode.First | Matching_Mode.Last | Regex_Mode) = Regex_Mode.All)
|
||||
|
@ -1,44 +0,0 @@
|
||||
## Options are used to configure how a regex engine behaves.
|
||||
|
||||
In this file, Enso provides a set of standard options that must be supported
|
||||
by all regex engines integrated with Enso.
|
||||
|
||||
type Regex_Option
|
||||
|
||||
## Specifies that all predefined character classes and POSIX character
|
||||
classes will match _only_ on ASCII characters.
|
||||
|
||||
! Performance
|
||||
If you are _sure_ that your data can only ever contain characters from
|
||||
the ASCII character set, you may be able to obtain a performance boost
|
||||
by specifying this flag. This may not be the case on all engines or all
|
||||
regexes.
|
||||
Ascii_Matching
|
||||
|
||||
## Specifies that matching should be performed in a case-insensitive manner.
|
||||
Case_Insensitive
|
||||
|
||||
## Specifies that the regular expression should be interpreted in comments
|
||||
mode.
|
||||
|
||||
Comments mode has the following changes:
|
||||
- Whitespace within the pattern is ignored, except when within a
|
||||
character class or when preceded by an unescaped backslash, or within
|
||||
grouping constructs (e.g. `(?...)`).
|
||||
- When a line contains a `#`, that is not in a character class and is not
|
||||
preceded by an unescaped backslash, all characters from the leftmost
|
||||
such `#` to the end of the line are ignored. That is to say, they act
|
||||
as _comments_ in the regex.
|
||||
Comments
|
||||
|
||||
## Specifies that the `.` special character should match everything
|
||||
_including_ newline characters. Without this flag, it will match all
|
||||
characters _except_ newlines.
|
||||
Dot_Matches_Newline
|
||||
|
||||
## Specifies that the pattern character `^` matches at both the beginning of
|
||||
the string and at the beginning of each line (immediately following a
|
||||
newline), and that the pattern character `$` matches at the end of each
|
||||
line _and_ at the end of the string.
|
||||
Multiline
|
||||
|
@ -1,10 +1,10 @@
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Extensions
|
||||
import project.Data.Text.Regex.Match_2.Match_2
|
||||
import project.Data.Text.Regex.Pattern_2.Match_Iterator_Value
|
||||
import project.Data.Text.Regex.Pattern_2.Pattern_2
|
||||
import project.Data.Text.Regex_2
|
||||
import project.Data.Text.Regex_2.No_Such_Group
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.No_Such_Group
|
||||
import project.Data.Text.Regex.Pattern.Match_Iterator_Value
|
||||
import project.Data.Text.Regex.Pattern.Pattern
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
@ -23,7 +23,7 @@ type Replacer
|
||||
|
||||
Implements a replacement for a regular expression.
|
||||
|
||||
Pattern_2.replace uses a Replacer to replace each regex match with
|
||||
Pattern.replace uses a Replacer to replace each regex match with
|
||||
a replacement string. This string can contain references to match
|
||||
groups from the original regex.
|
||||
|
||||
@ -40,7 +40,7 @@ type Replacer
|
||||
Arguments
|
||||
- replacement_string: a string, possibly containing group references,
|
||||
that will be used to provide a replacement in a regex match.
|
||||
new : Text -> Pattern_2 -> Replacer ! No_Such_Group
|
||||
new : Text -> Pattern -> Replacer ! No_Such_Group
|
||||
new replacement_string pattern =
|
||||
Replacer.Value (build_replacement_vector_cached replacement_string pattern)
|
||||
|
||||
@ -48,7 +48,7 @@ type Replacer
|
||||
|
||||
Arguments:
|
||||
- match: the match from the original string that is to be replaced.
|
||||
replace : Match_2 -> Text
|
||||
replace : Match -> Text
|
||||
replace self match =
|
||||
string_builder = StringBuilder.new
|
||||
self.replacement.each replacement->
|
||||
@ -82,7 +82,7 @@ group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))"
|
||||
|
||||
Uses Replacement_Cache to avoid rebuilding the vector for recently used
|
||||
replacement strings.
|
||||
build_replacement_vector_cached : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
|
||||
build_replacement_vector_cached : Text -> Pattern -> Vector Replacement ! No_Such_Group
|
||||
build_replacement_vector_cached replacement_string pattern =
|
||||
Replacer_Cache.get_or_set replacement_string _->
|
||||
build_replacement_vector replacement_string pattern
|
||||
@ -93,9 +93,9 @@ build_replacement_vector_cached replacement_string pattern =
|
||||
|
||||
Parse the replacement string into an alternating series of literal
|
||||
strings and group reference numbers.
|
||||
build_replacement_vector : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
|
||||
build_replacement_vector : Text -> Pattern -> Vector Replacement ! No_Such_Group
|
||||
build_replacement_vector replacement_string pattern =
|
||||
replacement_pattern = Regex_2.compile group_reference_regex
|
||||
replacement_pattern = Regex.compile group_reference_regex
|
||||
it = replacement_pattern.iterator replacement_string
|
||||
|
||||
builder = Vector.new_builder
|
||||
@ -117,14 +117,14 @@ build_replacement_vector replacement_string pattern =
|
||||
Parse a capture group reference.
|
||||
|
||||
Arguments:
|
||||
- pattern: the Pattern_2 used to initiate the replacement. This is used
|
||||
- pattern: the Pattern used to initiate the replacement. This is used
|
||||
to identify and validate capture groups.
|
||||
- match: the match of the replacement string against group_reference_regex.
|
||||
|
||||
Returns a Replacement: a group number, or, in the case of `$$`, a literal.
|
||||
|
||||
See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
|
||||
parse_group_number : Pattern_2 -> Match_2 -> Replacement ! No_Such_Group
|
||||
parse_group_number : Pattern -> Match -> Replacement ! No_Such_Group
|
||||
parse_group_number pattern match = case match.text.take 2 of
|
||||
"$$" -> Replacement.Literal "$"
|
||||
"$<" ->
|
||||
|
@ -1,86 +0,0 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Prim_Text_Helper
|
||||
import project.Data.Text.Regex.Pattern_2.Pattern_2
|
||||
import project.Data.Text.Text
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Errors.Common import Syntax_Error
|
||||
|
||||
polyglot java import java.util.regex.Pattern as Java_Pattern
|
||||
|
||||
## Compile the provided `expression` into a regex pattern that can be used for
|
||||
matching.
|
||||
|
||||
Arguments:
|
||||
- expression: The text representing the regular expression that you want to
|
||||
compile. Must be non-empty.
|
||||
- case_insensitive: Enables or disables case-insensitive matching. Case
|
||||
insensitive matching behaves as if it normalises the case of all input
|
||||
text before matching on it.
|
||||
|
||||
If an empty regex is used, `compile` throws an Illegal_Argument error.
|
||||
|
||||
? Why Compile?
|
||||
While many regex engines are able to cache ad-hoc patterns, it is often
|
||||
useful to be able to manually retain a pattern that you have computed. This
|
||||
function exists so you can hold onto the resultant `Pattern_2` object,
|
||||
instead of immediately proceeding to match using it.
|
||||
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error | Illegal_Argument
|
||||
compile self expression case_insensitive=Nothing =
|
||||
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
|
||||
options_string = if case_insensitive == True then "usgi" else "usg"
|
||||
|
||||
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
|
||||
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
|
||||
|
||||
Pattern_2.Value internal_regex_object
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Escape the special characters in `expression` such that the result is a
|
||||
valid literal pattern for the original string.
|
||||
|
||||
Arguments:
|
||||
- expression: The expression to escape metacharacters in.
|
||||
|
||||
> Example
|
||||
Turn a Text into a regex that matches that string exactly.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
|
||||
example_escape =
|
||||
literal_string = "\!\.|abcde"
|
||||
engine = Default_Engine.new
|
||||
engine.escape literal_string
|
||||
escape : Text -> Text
|
||||
escape self expression = Java_Pattern.quote expression
|
||||
|
||||
## An error that is emitted when there is no such group in the match for the
|
||||
provided `id`.
|
||||
|
||||
Arguments:
|
||||
- id: The identifier of the group that was asked for but does not exist.
|
||||
type No_Such_Group
|
||||
Error (id : Text | Integer)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Provides a human-readable representation of the `No_Such_Group`.
|
||||
to_display_text : Text
|
||||
to_display_text self = case self.id of
|
||||
_ : Integer -> "No group exists with the index " + self.id.to_text + "."
|
||||
_ : Text -> "No group exists with the name " + self.id + "."
|
||||
|
||||
## A syntax error reported by the Truffle regex compiler.
|
||||
type Regex_Syntax_Error
|
||||
## PRIVATE
|
||||
|
||||
Arguments:
|
||||
- message: A description of the erroneous syntax.
|
||||
Error message
|
@ -1,112 +0,0 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import project.Data.Text.Matching
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Pattern.Pattern
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Problem_Behavior.Problem_Behavior
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
|
||||
## Represents regex matching mode.
|
||||
type Regex_Matcher
|
||||
## Regex matching mode.
|
||||
|
||||
Arguments:
|
||||
- case_sensitivity: Specifies whether the matching should be case
|
||||
sensitive.
|
||||
- multiline: Enables or disables the multiline option. Multiline
|
||||
specifies that the `^` and `$` pattern characters match the start and
|
||||
end of lines, as well as the start and end of the input,
|
||||
respectively.
|
||||
- match_ascii: Enables or disables pure-ASCII matching for the regex. If
|
||||
you know your data only contains ASCII, you can enable this for a
|
||||
performance boost on some regex engines.
|
||||
- dot_matches_newline: Enables or disables the dot matches newline
|
||||
option. This specifies that the `.` special character should match
|
||||
everything _including_ newline characters. Without this flag, it
|
||||
matches all characters _except_ newlines.
|
||||
- comments: Enables or disables the comments mode for the regular
|
||||
expression. In comments mode, the following changes apply:
|
||||
- Whitespace within the pattern is ignored, except when within a
|
||||
character class or when preceded by an unescaped backslash, or within
|
||||
grouping constructs (e.g. `(?...)`).
|
||||
- When a line contains a `#` that is not in a character class and is
|
||||
not preceded by an unescaped backslash, all characters from the
|
||||
leftmost such `#` to the end of the line are ignored. That is to say,
|
||||
they act as 'comments' in the regex.
|
||||
Value (case_sensitivity : Case_Sensitivity = Case_Sensitivity.Sensitive) (multiline : Boolean = False) (match_ascii : Boolean = False) (dot_matches_newline : Boolean = False) (comments : Boolean = False)
|
||||
|
||||
## UNSTABLE
|
||||
Compiles a provided pattern according to the rules defined in this
|
||||
`Regex_Matcher`.
|
||||
compile : Text -> Pattern
|
||||
compile self pattern =
|
||||
case_insensitive = case self.case_sensitivity of
|
||||
Case_Sensitivity.Default -> False
|
||||
Case_Sensitivity.Sensitive -> False
|
||||
## TODO [RW] Currently locale is not supported in case-insensitive
|
||||
Regex matching. There are plans to revisit it:
|
||||
https://www.pivotaltracker.com/story/show/181313576
|
||||
Case_Sensitivity.Insensitive _ -> True
|
||||
Regex.compile pattern case_insensitive=case_insensitive match_ascii=self.match_ascii dot_matches_newline=self.dot_matches_newline multiline=self.multiline comments=self.comments
|
||||
|
||||
## UNSTABLE
|
||||
Checks if a name matches the provided criterion according to the specified
|
||||
matching strategy.
|
||||
|
||||
Arguments:
|
||||
- name: A `Text` representing the name being matched.
|
||||
- criterion: A `Text` representing the regular expression specifying the
|
||||
matching criterion.
|
||||
|
||||
> Example
|
||||
Check if the provided name matches a regular expression.
|
||||
|
||||
(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive) . match_single_criterion "Foobar" "f.*" == True
|
||||
match_single_criterion : Text -> Text -> Boolean
|
||||
match_single_criterion self name criterion =
|
||||
self.compile criterion . matches name
|
||||
|
||||
## UNSTABLE
|
||||
Selects objects from an input list that match any of the provided criteria.
|
||||
|
||||
Arguments:
|
||||
- objects: A list of objects to be matched.
|
||||
- criteria: A list of texts representing the matching criteria. Their meaning
|
||||
depends on the matching strategy.
|
||||
- reorder: Specifies whether to reorder the matched objects according to the
|
||||
order of the matching criteria.
|
||||
If `False`, the matched entries are returned in the same order as in the
|
||||
input.
|
||||
If `True`, the matched entries are returned in the order of the criteria
|
||||
matching them. If a single object has been matched by multiple criteria, it
|
||||
is placed in the group belonging to the first matching criterion on the
|
||||
list.
|
||||
If a single criterion's group has more than one element, their relative
|
||||
order is the same as in the input.
|
||||
- name_mapper: A function mapping a provided object to its name, which will
|
||||
then be matched with the criteria. It is set to the identity function by
|
||||
default, thus allowing the input to be a list of names to match. But it can
|
||||
be overridden to enable matching more complex objects.
|
||||
- matcher: A `Matcher` instance specifying how to interpret the criterion.
|
||||
- on_problems: Specifies the behavior when a problem occurs during the
|
||||
function.
|
||||
By default, a warning is issued, but the operation proceeds.
|
||||
If set to `Report_Error`, the operation fails with a dataflow error.
|
||||
If set to `Ignore`, the operation proceeds without errors or warnings.
|
||||
|
||||
> Example
|
||||
Selects objects matching one of the provided patterns, preserving the input order.
|
||||
|
||||
Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive . match_criteria ["foo", "foobar", "quux", "baz", "Foo"] [".*ba.*", "f.*"] == ["foo", "foobar", "baz"]
|
||||
|
||||
> Example
|
||||
Selects pairs matching their first element with the provided criteria and
|
||||
ordering the result according to the order of criteria that matched them.
|
||||
|
||||
Text_Matcher.match_criteria [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10] ["bar", "foo"] reorder=True name_mapper=_.name == [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0]
|
||||
match_criteria : Vector Any -> Vector Text -> Boolean -> (Any -> Text) -> Problem_Behavior -> Vector Any ! Matching.No_Matches_Found
|
||||
match_criteria self objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning =
|
||||
Matching.match_criteria_implementation self objects criteria reorder name_mapper on_problems
|
@ -1,75 +0,0 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Locale.Locale
|
||||
import project.Data.Text.Matching
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Problem_Behavior.Problem_Behavior
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
|
||||
## Represents exact text matching mode.
|
||||
type Text_Matcher
|
||||
## Represents exact text matching mode.
|
||||
Case_Sensitive
|
||||
|
||||
## Represents case-insensitive text matching mode.
|
||||
Case_Insensitive (locale:Locale=Locale.default)
|
||||
|
||||
## UNSTABLE
|
||||
Checks if a name matches the provided criterion according to the specified
|
||||
matching strategy.
|
||||
|
||||
Arguments:
|
||||
- name: A `Text` representing the name being matched.
|
||||
- criterion: A `Text` representing the name to be matched.
|
||||
|
||||
> Example
|
||||
Check if the provided name matches the criterion text.
|
||||
|
||||
Text_Matcher.match_single_criterion "Foobar" "foo" == False
|
||||
match_single_criterion : Text -> Text -> Boolean
|
||||
match_single_criterion self name criterion = case self of
|
||||
Text_Matcher.Case_Sensitive -> name == criterion
|
||||
Text_Matcher.Case_Insensitive locale -> name.equals_ignore_case criterion locale=locale
|
||||
|
||||
## UNSTABLE
|
||||
Selects objects from an input list that match any of the provided criteria.
|
||||
|
||||
Arguments:
|
||||
- objects: A list of objects to be matched.
|
||||
- criteria: A list of texts representing the matching criteria. Their meaning
|
||||
depends on the matching strategy.
|
||||
- reorder: Specifies whether to reorder the matched objects according to the
|
||||
order of the matching criteria.
|
||||
If `False`, the matched entries are returned in the same order as in the
|
||||
input.
|
||||
If `True`, the matched entries are returned in the order of the criteria
|
||||
matching them. If a single object has been matched by multiple criteria, it
|
||||
is placed in the group belonging to the first matching criterion on the
|
||||
list.
|
||||
If a single criterion's group has more than one element, their relative
|
||||
order is the same as in the input.
|
||||
- name_mapper: A function mapping a provided object to its name, which will
|
||||
then be matched with the criteria. It is set to the identity function by
|
||||
default, thus allowing the input to be a list of names to match. But it can
|
||||
be overridden to enable matching more complex objects.
|
||||
- matcher: A `Matcher` instance specifying how to interpret the criterion.
|
||||
- on_problems: Specifies the behavior when a problem occurs during the
|
||||
function.
|
||||
By default, a warning is issued, but the operation proceeds.
|
||||
If set to `Report_Error`, the operation fails with a dataflow error.
|
||||
If set to `Ignore`, the operation proceeds without errors or warnings.
|
||||
|
||||
> Example
|
||||
Selects objects matching one of the provided patterns, preserving the input order.
|
||||
|
||||
Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive . match_criteria ["foo", "foobar", "quux", "baz", "Foo"] [".*ba.*", "f.*"] == ["foo", "foobar", "baz"]
|
||||
|
||||
> Example
|
||||
Selects pairs matching their first element with the provided criteria and
|
||||
ordering the result according to the order of criteria that matched them.
|
||||
|
||||
Text_Matcher.match_criteria [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10] ["bar", "foo"] reorder=True name_mapper=_.name == [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0]
|
||||
match_criteria : Vector Any -> Vector Text -> Boolean -> (Any -> Text) -> Problem_Behavior -> Vector Any ! Matching.No_Matches_Found
|
||||
match_criteria self objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning =
|
||||
Matching.match_criteria_implementation self objects criteria reorder name_mapper on_problems
|
@ -94,10 +94,6 @@ import project.Data.Text.Line_Ending_Style.Line_Ending_Style
|
||||
import project.Data.Text.Location.Location
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
|
||||
import project.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
import project.Data.Text.Regex_Matcher.Regex_Matcher
|
||||
import project.Data.Text.Text_Matcher.Text_Matcher
|
||||
import project.Data.Text.Text_Ordering.Text_Ordering
|
||||
import project.Data.Text.Text_Sub_Range.Text_Sub_Range
|
||||
import project.Data.Time.Date.Date
|
||||
@ -146,10 +142,6 @@ export project.Data.Text.Line_Ending_Style.Line_Ending_Style
|
||||
export project.Data.Text.Location.Location
|
||||
export project.Data.Text.Matching_Mode.Matching_Mode
|
||||
export project.Data.Text.Regex
|
||||
export project.Data.Text.Regex.Regex_Mode.Regex_Mode
|
||||
export project.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
export project.Data.Text.Regex_Matcher.Regex_Matcher
|
||||
export project.Data.Text.Text_Matcher.Text_Matcher
|
||||
export project.Data.Text.Text_Ordering.Text_Ordering
|
||||
export project.Data.Text.Text_Sub_Range.Text_Sub_Range
|
||||
export project.Data.Time.Date.Date
|
||||
|
@ -1,12 +1,10 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Errors.Common.No_Such_Method
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Match.Match
|
||||
import Standard.Base.Errors.Common.No_Such_Method
|
||||
import Standard.Base.Network.HTTP.Response.Response
|
||||
import Standard.Base.Network.HTTP.Response_Body.Response_Body
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default.Match as Default_Engine_Match
|
||||
|
||||
from Standard.Table import Table, Column
|
||||
|
||||
from Standard.Image import Image, Read_Flag, Matrix
|
||||
@ -269,8 +267,7 @@ transactions_table =
|
||||
(enso_project.data / "food_shop_transactions.csv") . read
|
||||
|
||||
## An example regex match.
|
||||
match : Default_Engine_Match
|
||||
match : Match
|
||||
match =
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
pattern.match "aa ab abc a bc bcd" mode=Matching_Mode.First
|
||||
pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
|
||||
pattern.match "aa ab abc a bc bcd"
|
||||
|
@ -6,71 +6,6 @@ import java.util.regex.Pattern;
|
||||
|
||||
public class Regex_Utils {
|
||||
|
||||
/**
|
||||
* Obtains the names for named groups.
|
||||
*
|
||||
* <p>Assumes that the provided {@link Pattern} is syntactically valid. Behaviour is undefined if
|
||||
* run on a syntactically invalid pattern.
|
||||
*
|
||||
* @param pattern the pattern for which to get the group names
|
||||
* @return the names for the named groups in {@code pattern}
|
||||
*/
|
||||
public static String[] get_group_names(Pattern pattern) {
|
||||
String pattern_text = pattern.pattern();
|
||||
|
||||
char[] characters = pattern_text.toCharArray();
|
||||
ArrayList<String> names = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < pattern_text.length(); ++i) {
|
||||
char character = characters[i];
|
||||
|
||||
if (character == '\\') {
|
||||
++i;
|
||||
break;
|
||||
}
|
||||
|
||||
String header = "(?<";
|
||||
|
||||
if (pattern_text.startsWith(header, i)) {
|
||||
i += header.length();
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
|
||||
while (i < pattern_text.length()) {
|
||||
character = characters[i];
|
||||
|
||||
if (character == '>') {
|
||||
break;
|
||||
}
|
||||
|
||||
++i;
|
||||
|
||||
buffer.append(character);
|
||||
}
|
||||
|
||||
names.add(buffer.toString());
|
||||
}
|
||||
}
|
||||
|
||||
return names.toArray(new String[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for matches of the provided regular expression in the provided text.
|
||||
*
|
||||
* <p>This should behave exactly the same as `Regex.compile regex . find text` in Enso, it is here
|
||||
* only as a temporary workaround, because the Enso function gives wrong results on examples like
|
||||
* `Regex.compile "([0-9]+|[^0-9]+)" . find "1a2c"` where it returns `[1, a, 2]` instead of `[1,
|
||||
* a, 2, c]`.
|
||||
*/
|
||||
public static String[] find_all_matches(String regex, String text) {
|
||||
var allMatches = new ArrayList<String>();
|
||||
Matcher m = Pattern.compile(regex).matcher(text);
|
||||
while (m.find()) {
|
||||
allMatches.add(m.group());
|
||||
}
|
||||
return allMatches.toArray(new String[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a SQL-like pattern into a Regex with the same semantics.
|
||||
*
|
||||
@ -87,7 +22,7 @@ public class Regex_Utils {
|
||||
// Before inserting the converted wildcard, we append the accumulated characters, quoting
|
||||
// them first.
|
||||
if (acc.length() > 0) {
|
||||
result.append(Pattern.quote(acc.toString()));
|
||||
result.append(regexQuote(acc.toString()));
|
||||
acc.setLength(0);
|
||||
}
|
||||
|
||||
@ -103,7 +38,7 @@ public class Regex_Utils {
|
||||
|
||||
// If any trailing characters were left, we append them too.
|
||||
if (acc.length() > 0) {
|
||||
result.append(Pattern.quote(acc.toString()));
|
||||
result.append(regexQuote(acc.toString()));
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
|
@ -8,7 +8,7 @@ type Setup
|
||||
|
||||
make_expected_output_regex expected_output =
|
||||
parts = expected_output.split "???" . map Regex.escape
|
||||
Regex.compile (parts.join ".+") dot_matches_newline=True
|
||||
Regex.compile (parts.join ".+")
|
||||
|
||||
spec setup =
|
||||
run_test source_path =
|
||||
|
@ -2,6 +2,7 @@ from Standard.Base import all
|
||||
import Standard.Base.Data.Range.Empty_Error
|
||||
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
|
||||
import Standard.Base.Errors.Common.No_Such_Method
|
||||
import Standard.Base.Errors.Common.Type_Error
|
||||
import Standard.Base.Errors.Common.Unsupported_Argument_Types
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
import Standard.Base.Errors.Illegal_State.Illegal_State
|
||||
@ -143,8 +144,8 @@ spec = Test.group "Range" <|
|
||||
range.filter (Filter_Condition.Not_In [7, 3, 2]) . should_equal [1, 4, 5]
|
||||
|
||||
Test.expect_panic_with (range.filter (Filter_Condition.Starts_With "a")) No_Such_Method
|
||||
Test.expect_panic_with (range.filter (Filter_Condition.Like "a%")) Unsupported_Argument_Types
|
||||
Test.expect_panic_with (range.filter (Filter_Condition.Not_Like "a_")) Unsupported_Argument_Types
|
||||
range.filter (Filter_Condition.Like "a%") . should_fail_with Type_Error
|
||||
range.filter (Filter_Condition.Not_Like "a_") . should_fail_with Type_Error
|
||||
range.filter Filter_Condition.Is_True . should_equal []
|
||||
range.filter Filter_Condition.Is_False . should_equal []
|
||||
range.filter Filter_Condition.Is_Nothing . should_equal []
|
||||
|
@ -1,613 +0,0 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Text.Span.Utf_16_Span
|
||||
import Standard.Base.Errors.Common.Syntax_Error
|
||||
|
||||
import Standard.Base.Data.Text.Matching_Mode.Matching_Mode
|
||||
from Standard.Base.Data.Text.Regex import No_Such_Group, Invalid_Option
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
|
||||
polyglot java import java.util.regex.Pattern as Java_Pattern
|
||||
|
||||
from Standard.Test import Test, Test_Suite
|
||||
import Standard.Test.Extensions
|
||||
|
||||
default_mask = Java_Pattern.CANON_EQ.bit_or Java_Pattern.UNICODE_CASE . bit_or Java_Pattern.UNICODE_CHARACTER_CLASS
|
||||
|
||||
spec =
|
||||
Test.group "The default regex engine's options handling" <|
|
||||
|
||||
Test.specify "should convert options to Java" <|
|
||||
options = [Regex_Option.Comments, Regex_Option.Multiline, Default_Engine.Option.Unix_Lines]
|
||||
expected_mask = Java_Pattern.UNIX_LINES.bit_or Java_Pattern.COMMENTS . bit_or Java_Pattern.MULTILINE . bit_or default_mask
|
||||
actual_mask = Default_Engine.from_enso_options options
|
||||
|
||||
actual_mask . should_equal expected_mask
|
||||
|
||||
Test.specify "should specify the unicode options by default" <|
|
||||
actual_mask = Default_Engine.from_enso_options []
|
||||
|
||||
actual_mask . should_equal default_mask
|
||||
|
||||
Test.specify "should handle ascii matching by disabling unicode" <|
|
||||
actual_mask = Default_Engine.from_enso_options [Regex_Option.Ascii_Matching]
|
||||
actual_mask . should_equal 0
|
||||
|
||||
Test.specify "should result in an error when an option is invalid" <|
|
||||
Default_Engine.from_enso_options [""] . should_fail_with Invalid_Option
|
||||
Default_Engine.from_enso_options ["", Regex_Option.Ascii_Matching] . should_fail_with Invalid_Option
|
||||
|
||||
Test.group "The default regex engine (Default_Engine)" <|
|
||||
|
||||
Test.specify "should be able to compile patterns with no options" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "^a$" []
|
||||
pattern.engine . should_equal engine
|
||||
pattern.options . should_equal []
|
||||
pattern.internal_pattern.flags . should_equal default_mask
|
||||
|
||||
Test.specify "should be able to compile patterns with global options" <|
|
||||
engine = Default_Engine.new
|
||||
pattern = engine.compile "^a$" [Regex_Option.Multiline]
|
||||
pattern.engine . should_equal engine
|
||||
pattern.options . should_equal [Regex_Option.Multiline]
|
||||
pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.MULTILINE)
|
||||
|
||||
Test.specify "should be able to compile patterns with engine-specific options" <|
|
||||
engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
|
||||
pattern = engine.compile "^a$" []
|
||||
pattern.engine . should_equal engine
|
||||
pattern.options . should_equal [Default_Engine.Option.Literal_Pattern]
|
||||
pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.LITERAL)
|
||||
|
||||
Test.specify "should be able to compile patterns with combined options" <|
|
||||
engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
|
||||
pattern = engine.compile "^a$" [Regex_Option.Comments]
|
||||
pattern.engine . should_equal engine
|
||||
pattern.options.contains Default_Engine.Option.Literal_Pattern . should_be_true
|
||||
pattern.options.contains Regex_Option.Comments . should_be_true
|
||||
pattern.internal_pattern.flags . should_equal (default_mask . bit_or Java_Pattern.LITERAL . bit_or Java_Pattern.COMMENTS)
|
||||
|
||||
Test.specify "should return a syntax error of the regex syntax is invalid" <|
|
||||
engine = Default_Engine.new
|
||||
engine.compile "^(a" [] . should_fail_with Syntax_Error
|
||||
|
||||
Test.specify "should throw an invalid options error if an option is invalid" <|
|
||||
engine = Default_Engine.new
|
||||
engine.compile "^a$" ["invalid"] . should_fail_with Invalid_Option
|
||||
|
||||
Test.specify "should escape an expression for use as a literal" <|
|
||||
pattern = "http://example.com"
|
||||
engine = Default_Engine.new
|
||||
engine.escape pattern . should_equal "\Qhttp://example.com\E"
|
||||
|
||||
Test.group "The default regex engine's Pattern.matches" <|
|
||||
engine = Default_Engine.new
|
||||
|
||||
Test.specify "should return True when the pattern matches against the input" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
input = "aa ab abc a bc bcd"
|
||||
pattern.matches input . should_be_true
|
||||
|
||||
Test.specify "should return False when the pattern doesn't match against the input" <|
|
||||
pattern = engine.compile "aaz" []
|
||||
input = "aa ab abc a bc bcd"
|
||||
pattern.matches input . should_be_false
|
||||
|
||||
Test.specify "should check for full matches" <|
|
||||
pattern = engine.compile "f.o" []
|
||||
pattern.matches "foo" . should_be_true
|
||||
pattern.matches "foobar" . should_be_false
|
||||
|
||||
Test.group "The default regex engine's Pattern.match" <|
|
||||
engine = Default_Engine.new
|
||||
|
||||
Test.specify "should be able to `match` the first instance of the pattern in the input" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
match.group 0 . should_equal input
|
||||
|
||||
Test.specify "should return `Nothing` if there are no matches in first mode" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
input = "abc"
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_equal Nothing
|
||||
|
||||
Test.specify "should be able to `match` at most N instances of the pattern in the input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=3
|
||||
match.length . should_equal 3
|
||||
match.at 0 . group 0 . should_equal "ab"
|
||||
match.at 1 . group 0 . should_equal "cd"
|
||||
match.at 2 . group 0 . should_equal "ef"
|
||||
|
||||
Test.specify "should `match` fewer than N instances when there are fewer than N in the input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "abcdef"
|
||||
match = pattern.match input mode=5
|
||||
match.length . should_equal 3
|
||||
match.at 0 . group 0 . should_equal "ab"
|
||||
match.at 1 . group 0 . should_equal "cd"
|
||||
match.at 2 . group 0 . should_equal "ef"
|
||||
|
||||
Test.specify "should return `Nothing` when a counted match fails" <|
|
||||
pattern = engine.compile "(aa)" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=3
|
||||
match . should_equal Nothing
|
||||
|
||||
Test.specify "should be able to `match` the all instances of the pattern in the input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=Regex_Mode.All
|
||||
match.length . should_equal 5
|
||||
match.at 0 . group 0 . should_equal "ab"
|
||||
match.at 1 . group 0 . should_equal "cd"
|
||||
match.at 2 . group 0 . should_equal "ef"
|
||||
match.at 3 . group 0 . should_equal "gh"
|
||||
match.at 4 . group 0 . should_equal "ij"
|
||||
|
||||
Test.specify "should return `Nothing` when an all match match fails" <|
|
||||
pattern = engine.compile "(aa)" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=Regex_Mode.All
|
||||
match . should_equal Nothing
|
||||
|
||||
Test.specify "should be able to `match` the pattern against the entire input" <|
|
||||
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
|
||||
input = "aa ab abc a bc bcd"
|
||||
match = pattern.match input mode=Regex_Mode.Full
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
match.group 0 . should_equal input
|
||||
|
||||
Test.specify "should return `Nothing` if a full match does not match the entire input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "aa ab"
|
||||
full_match = pattern.match input mode=Regex_Mode.Full
|
||||
full_match . should_equal Nothing
|
||||
match = pattern.match input mode=Matching_Mode.First
|
||||
match . should_be_a Default_Engine.Match.Value
|
||||
|
||||
Test.specify "should be able to `match` the pattern against bounded input" <|
|
||||
pattern = engine.compile "(..)" []
|
||||
input = "abcdefghij"
|
||||
match = pattern.match input mode=(Regex_Mode.Bounded 2 8)
|
||||
match.length . should_equal 3
|
||||
match.at 0 . group 0 . should_equal "cd"
|
||||
match.at 1 . group 0 . should_equal "ef"
|
||||
match.at 2 . group 0 . should_equal "gh"
|
||||
|
||||
Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <|
    empty_regex = engine.compile "" []

    # On an empty text a single zero-width match at offset 0 is expected.
    on_empty = empty_regex.match "" mode=Regex_Mode.All
    on_empty.length . should_equal 1
    on_empty.at 0 . start 0 . should_equal 0
    on_empty.at 0 . end 0 . should_equal 0

    # On "ABC" four zero-width matches are expected; offsets 0, 1 and 3 are checked.
    on_abc = empty_regex.match "ABC" mode=Regex_Mode.All
    on_abc.length . should_equal 4
    on_abc.at 0 . start 0 . should_equal 0
    on_abc.at 0 . end 0 . should_equal 0
    on_abc.at 1 . start 0 . should_equal 1
    on_abc.at 1 . end 0 . should_equal 1
    on_abc.at 3 . start 0 . should_equal 3
    on_abc.at 3 . end 0 . should_equal 3
|
||||
|
||||
Test.group "The default regex engine's Pattern.find" <|
    # These specs check that `find` yields the matched text (or a vector of
    # texts) for each matching mode, and `Nothing` when nothing matches.
    engine = Default_Engine.new

    Test.specify "should be able to `find` the first instance of the pattern in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.find input mode=Matching_Mode.First
        match . should_be_a Text
        match . should_equal "ab"

    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.find input mode=Matching_Mode.First
        match . should_equal Nothing

    Test.specify "should be able to `find` at most N instances of the pattern in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        # An integer mode caps the number of results.
        match = pattern.find input mode=3
        match.length . should_equal 3
        match.at 0 . should_equal "ab"
        match.at 1 . should_equal "cd"
        match.at 2 . should_equal "ef"

    Test.specify "should `find` fewer than N instances when there are fewer than N in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdef"
        match = pattern.find input mode=5
        match.length . should_equal 3
        match.at 0 . should_equal "ab"
        match.at 1 . should_equal "cd"
        match.at 2 . should_equal "ef"

    Test.specify "should return `Nothing` when a counted match fails" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.find input mode=3
        match . should_equal Nothing

    Test.specify "should be able to `find` all instances of the pattern in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.find input mode=Regex_Mode.All
        match.length . should_equal 5
        match.at 0 . should_equal "ab"
        match.at 1 . should_equal "cd"
        match.at 2 . should_equal "ef"
        match.at 3 . should_equal "gh"
        match.at 4 . should_equal "ij"

    Test.specify "should return `Nothing` when an all match fails" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.find input mode=Regex_Mode.All
        match . should_equal Nothing

    Test.specify "should be able to `find` the pattern against the entire input" <|
        pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
        input = "aa ab abc a bc bcd"
        match = pattern.find input mode=Regex_Mode.Full
        match . should_be_a Text
        match . should_equal input

    Test.specify "should return `Nothing` if a full find does not match the entire input" <|
        pattern = engine.compile "(..)" []
        input = "aa ab"
        full_match = pattern.find input mode=Regex_Mode.Full
        full_match . should_equal Nothing

    Test.specify "should be able to `find` the pattern against bounded input" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.find input mode=(Regex_Mode.Bounded 2 8)
        match.length . should_equal 3
        match.at 0 . should_equal "cd"
        match.at 1 . should_equal "ef"
        match.at 2 . should_equal "gh"

        # A count larger than the number of matches inside the bounds changes nothing.
        match_2 = pattern.find input mode=(Regex_Mode.Bounded 2 8 mode=10)
        match_2.length . should_equal 3
        match_2.at 0 . should_equal "cd"
        match_2.at 1 . should_equal "ef"
        match_2.at 2 . should_equal "gh"

        # A smaller count truncates the bounded results.
        match_3 = pattern.find input mode=(Regex_Mode.Bounded 2 8 mode=2)
        match_3.length . should_equal 2
        match_3.at 0 . should_equal "cd"
        match_3.at 1 . should_equal "ef"

    Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
        engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
        engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"]

        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=5 . should_equal ["a", "1", "b", "2"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Regex_Mode.Bounded 1 3) . should_equal ["1", "b"]
|
||||
|
||||
Test.group "The default regex engine's Pattern.split" <|
    # These specs check the pieces `split` returns when cutting the input at
    # the first, at most N, or all occurrences of the pattern.
    engine = Default_Engine.new

    Test.specify "should be able to `split` on the first instance of the pattern" <|
        pattern = engine.compile "cd" []
        input = "abcdefghij"
        match = pattern.split input mode=Matching_Mode.First
        match.length . should_equal 2
        match.at 0 . should_equal "ab"
        match.at 1 . should_equal "efghij"

    Test.specify "should return the original text if there are no matches in first mode" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.split input mode=Matching_Mode.First
        match . should_equal ["abcdefghij"]

    Test.specify "should be able to `split` on at most N instances of the pattern in the input" <|
        pattern = engine.compile "a" []
        input = "bacadaeaf"
        # Three splits give four pieces; the last piece keeps the unsplit remainder.
        match = pattern.split input mode=3
        match.length . should_equal 4
        match.at 0 . should_equal "b"
        match.at 1 . should_equal "c"
        match.at 2 . should_equal "d"
        match.at 3 . should_equal "eaf"

    Test.specify "should `split` on fewer than N instances when there are fewer than N in the input" <|
        pattern = engine.compile "a" []
        input = "bacadaeaf"
        match = pattern.split input mode=10
        match.length . should_equal 5
        match.at 0 . should_equal "b"
        match.at 1 . should_equal "c"
        match.at 2 . should_equal "d"
        match.at 3 . should_equal "e"
        match.at 4 . should_equal "f"

    Test.specify "should be able to `split` on all instances of the pattern in the input" <|
        pattern = engine.compile "(a)" []
        input = "bacadaeaf"
        match = pattern.split input mode=Regex_Mode.All
        match.length . should_equal 5
        match.at 0 . should_equal "b"
        match.at 1 . should_equal "c"
        match.at 2 . should_equal "d"
        match.at 3 . should_equal "e"
        match.at 4 . should_equal "f"
|
||||
|
||||
Test.group "The default regex engine's Pattern.replace" <|
    # These specs check the text `replace` returns for each matching mode,
    # including the cases where nothing matches and the input comes back as is.
    engine = Default_Engine.new

    Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
        pattern = engine.compile "abc" []
        input = "aa ab abc a bc abc"
        match = pattern.replace input "REPLACED" mode=Matching_Mode.First
        match . should_be_a Text
        match . should_equal "aa ab REPLACED a bc abc"

    Test.specify "should return the string unchanged if there are no matches to replace in first mode" <|
        pattern = engine.compile "xyz" []
        input = "aa ab ac ad"
        match = pattern.replace input "REPLACED" mode=Matching_Mode.First
        match . should_equal input

    Test.specify "should be able to replace at most N instances of the pattern in the input" <|
        pattern = engine.compile "aa" []
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "REPLACED" mode=3
        match . should_equal "REPLACED ab REPLACED ac ad REPLACED aa ax"

    Test.specify "should replace fewer than N instances when there are fewer than N in the input" <|
        pattern = engine.compile "aa" []
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "REPLACED" mode=10
        match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"

    Test.specify "should return the input when a counted replace fails" <|
        pattern = engine.compile "aa" []
        input = "abcdefghij"
        match = pattern.replace input "REPLACED" mode=3
        match . should_equal input

    Test.specify "should be able to replace all instances of the pattern in the input" <|
        pattern = engine.compile "aa" []
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "REPLACED" mode=Regex_Mode.All
        match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"

    Test.specify "should return the input when an all replace fails" <|
        pattern = engine.compile "aa" []
        input = "abcdefghij"
        match = pattern.replace input "REPLACED" mode=Regex_Mode.All
        match . should_equal input

    Test.specify "should be able to replace the entire input only if it matches" <|
        pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
        input = "aa ab abc a bc bcd"
        match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
        match . should_equal "REPLACED"

    Test.specify "should correctly replace entire input in Full mode even if partial matches are possible" <|
        pattern = engine.compile "(aa)+" []
        # "aaa" only contains a partial match ("aa"), so Full mode must leave it alone.
        pattern.replace "aaa" "REPLACED" mode=Regex_Mode.Full . should_equal "aaa"
        pattern.replace "aaaa" "REPLACED" mode=Regex_Mode.Full . should_equal "REPLACED"

    Test.specify "should return the input for a full replace if the pattern doesn't match the entire input" <|
        pattern = engine.compile "(..)" []
        input = "aa ab"
        full_match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
        full_match . should_equal input

    Test.specify "should not perform overlapping replacements in counted mode" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        result = pattern.replace input "REPLACED" mode=3
        result . should_equal "REPLACEDREPLACEDREPLACEDghij"

    Test.specify "should not perform overlapping replacements in all mode" <|
        pattern = engine.compile "(..)" []
        input = "aa ab"
        match = pattern.replace input "REPLACED" mode=Regex_Mode.All
        match . should_equal "REPLACEDREPLACEDb"

    Test.specify "should handle capture groups in replacement" <|
        pattern = engine.compile "(?<capture>[a-z]+)" []
        # Numbered reference to the capture group.
        pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$1]" mode=0 . should_equal "foo bar, baz"
        pattern.replace "foo bar, baz" "[$1]" mode=1 . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[$1]" mode=2 . should_equal "[foo] [bar], baz"
        pattern.replace "foo bar, baz" "[$1]" mode=3 . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$1]" mode=4 . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"

        # Named reference to the same group; the expected results are identical.
        pattern.replace "foo bar, baz" "[${capture}]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[${capture}]" mode=0 . should_equal "foo bar, baz"
        pattern.replace "foo bar, baz" "[${capture}]" mode=1 . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[${capture}]" mode=2 . should_equal "[foo] [bar], baz"
        pattern.replace "foo bar, baz" "[${capture}]" mode=3 . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[${capture}]" mode=4 . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"

    # Every call in this spec uses Regex_Mode.Full, so the label says "Full"
    # (it previously said "All", which is already covered by the spec above).
    Test.specify "should handle capture groups in replacement in Full mode" <|
        pattern = engine.compile "([a-z]+)" []
        pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.Full . should_equal "foo bar, baz"
        pattern.replace "foo" "[$1]" mode=Regex_Mode.Full . should_equal "[foo]"

        pattern_2 = engine.compile '<a href="(?<addr>.*?)">(?<name>.*?)</a>' []
        pattern_2.replace '<a href="url">content</a>' "$2 <- $1" mode=Regex_Mode.Full . should_equal "content <- url"
        pattern_2.replace '<a href="url">content</a>' "${name} <- ${addr}" mode=Regex_Mode.Full . should_equal "content <- url"
|
||||
|
||||
Test.group "Match.group" <|
    # The pattern declares four groups: 1 (unnamed), 2 ("letters"), 3 (unnamed,
    # lazily optional) and 4 ("empty", lazily optional). Index 0 is the whole match.
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First

    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value

    Test.specify "should return the full match with index 0" <|
        match.group 0 . should_equal "aa ab abc a bc bcd"

    Test.specify "should return the group contents if it matches by index" <|
        match.group 1 . should_equal "aa ab "

    Test.specify "should return the group contents if it matches by name" <|
        match.group "letters" . should_equal "abc a bc bcd"

    Test.specify "should return Nothing if the group did not match" <|
        match.group 3 . should_equal Nothing

    Test.specify "should fail with No_Such_Group if the group did not exist" <|
        match.group "fail" . should_fail_with No_Such_Group
        match.group 5 . should_fail_with No_Such_Group

    Test.specify "should make named groups accessible by index" <|
        match.group 2 . should_equal (match.group "letters")
|
||||
|
||||
Test.group "Match.groups" <|
    default_engine = Default_Engine.new
    regex = default_engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    first_match = regex.match "aa ab abc a bc bcd" mode=Matching_Mode.First

    Test.specify "should be a Match" <|
        first_match . should_be_a Default_Engine.Match.Value

    Test.specify "should return the results of all groups" <|
        # Entry 0 is the whole match; the two trailing optional groups are expected to be unmatched.
        found = first_match.groups
        found.length . should_equal 5
        found.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing]

    Test.specify "should replace unmatched groups by a user-specified value" <|
        found = first_match.groups "UNMATCHED"
        found.length . should_equal 5
        found.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"]
|
||||
|
||||
Test.group "Match.named_groups" <|
    default_engine = Default_Engine.new
    regex = default_engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    first_match = regex.match "aa ab abc a bc bcd" mode=Matching_Mode.First

    Test.specify "should be a Match" <|
        first_match . should_be_a Default_Engine.Match.Value

    Test.specify "should return the results of all named groups" <|
        # Only "letters" and "empty" carry names in the pattern.
        by_name = first_match.named_groups
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal Nothing

    Test.specify "should replace unmatched groups by a user-specified value" <|
        by_name = first_match.named_groups "UNMATCHED"
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal "UNMATCHED"
|
||||
|
||||
Test.group "Match.start" <|
    # Checks the start offset reported for groups addressed by index or by name.
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First

    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value

    Test.specify "should return the start of a group by index" <|
        match.start 1 . should_equal 0

    Test.specify "should return the start of a group by name" <|
        match.start "letters" . should_equal 6

    Test.specify "should return Nothing if the group didn't match" <|
        match.start 3 . should_equal Nothing
        match.start "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group if the group doesn't exist" <|
        match.start 5 . should_fail_with No_Such_Group
        match.start "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
Test.group "Match.end" <|
    # Checks the end offset reported for groups addressed by index or by name.
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First

    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value

    Test.specify "should return the end of a group by index" <|
        match.end 1 . should_equal 6

    Test.specify "should return the end of a group by name" <|
        match.end "letters" . should_equal 18

    Test.specify "should return Nothing if the group didn't match" <|
        match.end 3 . should_equal Nothing
        match.end "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group if the group doesn't exist" <|
        match.end 5 . should_fail_with No_Such_Group
        match.end "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
Test.group "Match.span" <|
    # Checks the Utf_16_Span reported for groups addressed by index or by name.
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First

    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value

    Test.specify "should get the span of a group by index" <|
        match.span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input)

    Test.specify "should get the span of a group by name" <|
        match.span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input)

    Test.specify "should return Nothing if the group didn't match" <|
        match.span 3 . should_equal Nothing
        match.span "empty" . should_equal Nothing

    Test.specify "should fail with a No_Such_Group if the group doesn't exist" <|
        match.span 5 . should_fail_with No_Such_Group
        match.span "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
Test.group "Match.start_position" <|
    default_engine = Default_Engine.new
    regex = default_engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    first_match = regex.match "aa ab abc a bc bcd" mode=Matching_Mode.First

    Test.specify "should be a Match" <|
        first_match . should_be_a Default_Engine.Match.Value

    Test.specify "should return the region start over which self match was performed" <|
        # The match was run over the whole text, so the region starts at offset 0.
        first_match.start_position . should_equal 0
|
||||
|
||||
Test.group "Match.end_position" <|
    default_engine = Default_Engine.new
    regex = default_engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    first_match = regex.match "aa ab abc a bc bcd" mode=Matching_Mode.First

    Test.specify "should be a Match" <|
        first_match . should_be_a Default_Engine.Match.Value

    Test.specify "should return the region end over which self match was performed" <|
        # The text is 18 characters long and was searched in full.
        first_match.end_position . should_equal 18
|
||||
|
||||
# Passes `spec` to `Test_Suite.run_main`, so the specs defined above run when `main` is invoked.
main = Test_Suite.run_main spec
|
@ -1,78 +0,0 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Text.Matching
|
||||
import Standard.Base.Errors.Common.No_Such_Method
|
||||
|
||||
from Standard.Test import Test, Test_Suite, Problems
|
||||
import Standard.Test.Extensions
|
||||
|
||||
# Placeholder error type; the 'should correctly forward errors' spec throws it
# and checks that it surfaces from `match_criteria`.
type Foo_Error
|
||||
|
||||
spec = Test.group 'Matching Helper' <|
    # Matchers shared by the specs below.
    exact = Text_Matcher.Case_Sensitive
    caseless = Text_Matcher.Case_Insensitive
    regex = Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive
    caseless_regex = Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive

    Test.specify 'should match a single name with a single Text_Matcher criterion' <|
        exact.match_single_criterion "foo" "foo" . should_be_true
        exact.match_single_criterion "foobar" "foo" . should_be_false
        # Text criteria are compared literally, so regex syntax has no special meaning.
        exact.match_single_criterion "foo" "f.*" . should_be_false
        exact.match_single_criterion "foo" "Foo" . should_be_false

    Test.specify 'should correctly handle Unicode folding with Text_Matcher matching' <|
        # A precomposed 'é' and 'e' followed by a combining acute accent must compare equal.
        exact.match_single_criterion '\u00E9' '\u0065\u{301}' . should_be_true
        exact.match_single_criterion 'é' '\u00E9' . should_be_true
        exact.match_single_criterion 'é' 'ę' . should_be_false

    Test.specify 'should match a single name with a single regex criterion' <|
        regex.match_single_criterion "foo" "foo" . should_be_true
        # The criterion has to match the whole name, not just a prefix.
        regex.match_single_criterion "foobar" "foo" . should_be_false
        regex.match_single_criterion "foo" "f.*" . should_be_true
        regex.match_single_criterion "foo" "foo.*" . should_be_true
        regex.match_single_criterion "foo" "F.*" . should_be_false

    Test.specify 'should support case-insensitive matching' <|
        caseless_regex.match_single_criterion "foo" "F.*" . should_be_true
        caseless.match_single_criterion "foO" "FOo" . should_be_true

        caseless_regex.match_single_criterion "foo" "fF.*" . should_be_false
        caseless.match_single_criterion "foo" "Foos" . should_be_false

        # Lowercase beta 'β' case-folds to capital beta 'Β', which looks like the
        # Latin capital 'B' but is a different character.
        caseless.match_single_criterion "β" "Β" . should_be_true
        caseless.match_single_criterion "β" "B" . should_be_false

    Test.specify 'should match a list of names with a list of criteria, correctly handling reordering' <|
        names = ["foo", "bar", "baz"]
        criteria = ["baz", "foo"]
        # With reorder=True the result follows the criteria order, otherwise the input order.
        exact.match_criteria names criteria reorder=True . should_equal ["baz", "foo"]
        exact.match_criteria names criteria reorder=False . should_equal ["foo", "baz"]

    Test.specify 'should allow multiple matches to a single criterion (Regex)' <|
        names = ["foo", "bar", "baz", "quux"]
        regex.match_criteria names ["b.*"] reorder=True . should_equal ["bar", "baz"]
        regex.match_criteria names ["b.*", "foo"] reorder=False . should_equal ["foo", "bar", "baz"]

    Test.specify 'should include the object only with the first criterion that matched it, avoiding duplication' <|
        names = ["foo", "bar", "baz", "zap"]
        criteria = [".*z.*", "b.*"]
        # "baz" satisfies both criteria but must show up only once.
        regex.match_criteria names criteria reorder=True . should_equal ["baz", "zap", "bar"]
        regex.match_criteria names criteria reorder=False . should_equal ["bar", "baz", "zap"]

    Test.specify 'should correctly handle criteria which did not match anything' <|
        names = ["foo", "bar", "baz"]
        check_result = _.should_equal ["baz"]

        run_reordered = exact.match_criteria names ["baz", "unknown_column"] reorder=True on_problems=_
        expected_reordered = [Matching.No_Matches_Found.Error ["unknown_column"]]
        Problems.test_problem_handling run_reordered expected_reordered check_result

        # All unmatched criteria are reported together in a single problem.
        run_in_order = exact.match_criteria names ["baz", "unknown_column_1", "unknown_column_2"] reorder=False on_problems=_
        expected_in_order = [Matching.No_Matches_Found.Error ["unknown_column_1", "unknown_column_2"]]
        Problems.test_problem_handling run_in_order expected_in_order check_result

    Test.specify 'should correctly work with complex object using a function extracting their names' <|
        candidates = [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10]
        expected = [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0]
        exact.match_criteria candidates ["bar", "foo"] reorder=True name_mapper=_.first . should_equal expected

        exact.match_criteria [1, 2, 3] ["2"] name_mapper=_.to_text . should_equal [2]

    Test.specify 'should correctly forward errors' <|
        # An error in the objects, in the criteria, in the matcher itself or raised by the name mapper must surface.
        exact.match_criteria (Error.throw Foo_Error) [] . should_fail_with Foo_Error
        exact.match_criteria [] (Error.throw Foo_Error) . should_fail_with Foo_Error
        (Error.throw Foo_Error).match_criteria [] [] . should_fail_with Foo_Error
        exact.match_criteria [1, 2, 3] ["2"] name_mapper=(x-> if x == 3 then Error.throw Foo_Error else x.to_text) . should_fail_with Foo_Error

        # Calling a method that does not exist is a panic rather than a dataflow error.
        Test.expect_panic_with matcher=No_Such_Method <|
            exact.match_criteria ["a"] ["a"] name_mapper=_.nonexistent_function
|
||||
|
||||
# Passes `spec` to `Test_Suite.run_main`, so the 'Matching Helper' specs run when `main` is invoked.
main = Test_Suite.run_main spec
|
@ -1,487 +0,0 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Text.Span.Span
|
||||
import Standard.Base.Data.Text.Span.Utf_16_Span
|
||||
import Standard.Base.Data.Text.Regex.Match_2.Match_2
|
||||
import Standard.Base.Data.Text.Regex.Pattern_2.Pattern_2
|
||||
import Standard.Base.Data.Text.Regex.Replacer.Replacer
|
||||
import Standard.Base.Data.Text.Regex_2
|
||||
import Standard.Base.Data.Text.Regex_2.No_Such_Group
|
||||
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
|
||||
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
|
||||
|
||||
from Standard.Test import Test, Test_Suite
|
||||
import Standard.Test.Extensions
|
||||
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
|
||||
spec =
|
||||
# Compilation: a valid regex yields a `Pattern_2`; malformed or empty patterns
# are reported as errors.
Test.group "Compile" <|
    Test.specify "should be able to be compiled" <|
        compiled = Regex_2.compile "(?<dots>..)" case_insensitive=True
        compiled.should_be_a Pattern_2

    Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
        # Unbalanced parentheses.
        Regex_2.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error

    Test.specify "should disallow empty patterns in `compile`" <|
        Regex_2.compile "" . should_fail_with Illegal_Argument
|
||||
|
||||
# Escaping wraps the text in `\Q...\E` so it is matched literally.
Test.group "Escape" <|
    Test.specify "should escape an expression for use as a literal" <|
        literal = "http://example.com"
        escaped = Regex_2.escape literal
        escaped . should_equal "\Qhttp://example.com\E"
|
||||
|
||||
# `matches` is a full-input check: a match of only a prefix does not count.
Test.group "Pattern.matches" <|
    Test.specify "should return True when the pattern matches against the input" <|
        compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        haystack = "aa ab abc a bc bcd"
        compiled.matches haystack . should_be_true

    Test.specify "should return False when the pattern doesn't match against the input" <|
        compiled = Regex_2.compile "aaz"
        haystack = "aa ab abc a bc bcd"
        compiled.matches haystack . should_be_false

    Test.specify "should check for full matches" <|
        compiled = Regex_2.compile "f.o"
        compiled.matches "foo" . should_be_true
        # "foobar" only starts with a match, so it is rejected.
        compiled.matches "foobar" . should_be_false

    Test.specify "`matches` with an empty pattern should be an error" <|
        compiled = Regex_2.compile ""
        compiled.matches "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
# `match` returns the first match (or `Nothing`); `match_all` returns a vector
# of matches (possibly empty). Both reject an empty pattern.
Test.group "Pattern.match" <|
    Test.specify "should be able to `match` the first instance of the pattern in the input" <|
        compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        haystack = "aa ab abc a bc bcd"
        found = compiled.match haystack
        found.should_be_a Match_2
        # Group 0 is the whole match, which here spans the entire input.
        found.text 0 . should_equal haystack

    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        haystack = "abc"
        found = compiled.match haystack
        found.should_equal Nothing

    Test.specify "should be able to `match` the all instances of the pattern in the input" <|
        compiled = Regex_2.compile "(..)"
        haystack = "abcdefghij"
        hits = compiled.match_all haystack
        hits.length . should_equal 5
        hits.at 0 . text 0 . should_equal "ab"
        hits.at 1 . text 0 . should_equal "cd"
        hits.at 2 . text 0 . should_equal "ef"
        hits.at 3 . text 0 . should_equal "gh"
        hits.at 4 . text 0 . should_equal "ij"

    Test.specify "should return `[]` when an all match match fails" <|
        compiled = Regex_2.compile "(aa)"
        haystack = "abcdefghij"
        hits = compiled.match_all haystack
        hits.should_equal []

    Test.specify "`match` with an empty pattern should be an error" <|
        compiled = Regex_2.compile ""
        compiled.match "ABC" . should_fail_with Illegal_Argument

    Test.specify "`match_all` with an empty pattern should be an error" <|
        compiled = Regex_2.compile ""
        compiled.match_all "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
# `find` / `find_all` return the matched text itself instead of match objects.
Test.group "Pattern_2.find and .find_all" <|
    Test.specify "should be able to `find` the first instance of the pattern in the input" <|
        compiled = Regex_2.compile "(..)"
        haystack = "abcdefghij"
        found = compiled.find haystack
        found.should_be_a Text
        found.should_equal "ab"

    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        compiled = Regex_2.compile "(aa)"
        haystack = "abcdefghij"
        found = compiled.find haystack
        found.should_equal Nothing

    Test.specify "should be able to `find` the all instances of the pattern in the input" <|
        compiled = Regex_2.compile "(..)"
        haystack = "abcdefghij"
        pieces = compiled.find_all haystack
        pieces.length . should_equal 5
        pieces.at 0 . should_equal "ab"
        pieces.at 1 . should_equal "cd"
        pieces.at 2 . should_equal "ef"
        pieces.at 3 . should_equal "gh"
        pieces.at 4 . should_equal "ij"

    Test.specify "should return `[]` when an all match match fails" <|
        compiled = Regex_2.compile "(aa)"
        haystack = "abcdefghij"
        pieces = compiled.find_all haystack
        pieces.should_equal []

    Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
        # The final single-character match must not be dropped.
        Regex_2.compile "(a+|1+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
        Regex_2.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
        Regex_2.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"]

    Test.specify "`find` with an empty pattern should be an error" <|
        compiled = Regex_2.compile ""
        compiled.find "ABC" . should_fail_with Illegal_Argument

    Test.specify "`find_all` with an empty pattern should be an error" <|
        compiled = Regex_2.compile ""
        compiled.find_all "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
# Splitting keeps empty segments and returns the input unchanged (as a
# one-element vector) when the pattern does not occur.
Test.group "Pattern_2.split" <|
    Test.specify "should be able to `split` on the first instance of the pattern" <|
        separator = Regex_2.compile "cd"
        haystack = "abcdefcdghij"
        parts = separator.split haystack only_first=True
        parts.should_equal ["ab", "efcdghij"]

    Test.specify "should return the original text if there are no matches in first mode" <|
        separator = Regex_2.compile "aa"
        haystack = "abcdefghij"
        parts = separator.split haystack only_first=True
        parts.should_equal ["abcdefghij"]

    Test.specify "should return the original text if there are no matches in all mode" <|
        separator = Regex_2.compile "aa"
        haystack = "abcdefghij"
        parts = separator.split haystack
        parts.should_equal ["abcdefghij"]

    Test.specify "should be able to `split` on the all instances of the pattern in the input" <|
        separator = Regex_2.compile "a"
        separator.split "bacadaeaf" . should_equal ["b", "c", "d", "e", "f"]
        # Adjacent, leading and trailing separators produce empty segments.
        separator.split "baab" . should_equal ["b", "", "b"]
        separator.split "aaa" . should_equal ["", "", "", ""]
        separator.split "" . should_equal [""]
        separator.split "a" . should_equal ["", ""]
        separator.split "abaca" . should_equal ["", "b", "c", ""]

    Test.specify "should split without normalization" <|
        separator = Regex_2.compile "s"
        # A plain `s` followed by a combining accent is still split on; the accent stays behind.
        separator.split 'aśsśs\u{301}śb' . should_equal ['aś', 'ś', '\u{301}śb']
|
||||
|
||||
# `tokenize` yields one token per match: the whole match when the pattern has
# no capturing groups, otherwise the concatenated text of its top-level
# capturing groups.
#
# Every spec passes its body with an explicit `<|`, the same as all the other
# specs in this suite.
Test.group "Pattern_2.tokenize" <|
    Test.specify "can tokenize with simple regexes without capturing groups" <|
        Regex_2.compile "[a-z]+" . tokenize "1-800-regex-yes" . should_equal ["regex", "yes"]
        Regex_2.compile "[a-z]+" case_insensitive=True . tokenize "1-800-REGEX-YES" . should_equal ["REGEX", "YES"]
        Regex_2.compile "\d\d" . tokenize "12 hi345 67r890r" . should_equal ["12", "34", "67", "89"]

    Test.specify "can tokenize with regexes with capturing groups" <|
        Regex_2.compile "(\d\d)\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "[a-z]+(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
        # A group that may match nothing still contributes an (empty) token.
        Regex_2.compile "[a-z]+(\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["", "182", "20", ""]

    Test.specify "ignores non-capturing groups" <|
        Regex_2.compile "(?:(\d\d)\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "(\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "(?<foo>\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "(?:[a-z]+)(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]

    Test.specify "ignores nested groups" <|
        # Only the outermost capturing group contributes; its nested groups are not appended again.
        Regex_2.compile "(\d(\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "(?<foo>\d(?<bar>\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "[a-z]+((\d)\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
        Regex_2.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"]

    Test.specify "non-participating groups are rendered as the empty string" <|
        Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_4_0" . should_equal ['340']
        Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_q_0" . should_equal ['3q0']

    Test.specify "handles unicode" <|
        Regex_2.compile "[áê]+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
        # `+` only applies to the accent `\u{301}`, not to the entire grapheme.
        Regex_2.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
        Regex_2.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}a\u{301}']
        Regex_2.compile "x([áê]+)y" . tokenize "xáy xêy" . should_equal ["á", "ê"]

    Test.specify "examples are correct" <|
        Regex_2.compile "..." . tokenize "ABCDEF" . should_equal ["ABC","DEF"]
        Regex_2.compile "(.).(.)" . tokenize "ABCDEF" . should_equal ["AC","DF"]
        Regex_2.compile "(\S+)(?:\s+|$)" . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' . should_equal ["Hello","Big","Wide","World","Goodbye!"]
|
||||
|
||||
# `replace` substitutes every non-overlapping match (or only the first with
# `only_first=True`) and supports `$N`, `$<name>`, `$0` and `$&` references.
Test.group "Pattern_2.replace" <|
    Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
        compiled = Regex_2.compile "abc"
        haystack = "aa ab abc a bc abc"
        replaced = compiled.replace haystack "REPLACED" only_first=True
        replaced.should_be_a Text
        replaced.should_equal "aa ab REPLACED a bc abc"

    Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <|
        compiled = Regex_2.compile "xyz"
        haystack = "aa ab ac ad"
        replaced = compiled.replace haystack "REPLACED" only_first=True
        replaced.should_equal haystack

    Test.specify "should be able to replace the all instances of the pattern in the input" <|
        compiled = Regex_2.compile "aa"
        haystack = "aa ab aa ac ad aa aa ax"
        replaced = compiled.replace haystack "REPLACED"
        replaced.should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"

    Test.specify "should return the input when an all replace fails" <|
        compiled = Regex_2.compile "aa"
        haystack = "abcdefghij"
        replaced = compiled.replace haystack "REPLACED"
        replaced.should_equal haystack

    Test.specify "should be able to replace the entire input only if it matches" <|
        compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        haystack = "aa ab abc a bc bcd"
        replaced = compiled.replace haystack "REPLACED"
        replaced.should_equal "REPLACED"

    Test.specify "should not perform overlapping replacements in all mode" <|
        compiled = Regex_2.compile "(..)"
        haystack = "aa ab"
        replaced = compiled.replace haystack "REPLACED"
        # Two disjoint two-character matches are replaced; the trailing "b" is left over.
        replaced.should_equal "REPLACEDREPLACEDb"

    Test.specify "should handle capture groups in replacement" <|
        compiled = Regex_2.compile "(?<capture>[a-z]+)"
        subject = "foo bar, baz"

        # Reference by group index.
        compiled.replace subject "[$1]" . should_equal "[foo] [bar], [baz]"
        compiled.replace subject "[$1]" only_first=True . should_equal "[foo] bar, baz"

        # Reference by group name.
        compiled.replace subject "[$<capture>]" . should_equal "[foo] [bar], [baz]"
        compiled.replace subject "[$<capture>]" only_first=True . should_equal "[foo] bar, baz"

        # Whole-match references.
        compiled.replace subject "[$0]" . should_equal "[foo] [bar], [baz]"
        compiled.replace subject "[$0]" only_first=True . should_equal "[foo] bar, baz"
        compiled.replace subject "[$&]" . should_equal "[foo] [bar], [baz]"
        compiled.replace subject "[$&]" only_first=True . should_equal "[foo] bar, baz"

    Test.specify "should handle unicode in capture group names" <|
        compiled = Regex_2.compile "(?<건반>[a-z]+)"
        compiled.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]"
|
||||
|
||||
# Checks the examples given in the `replace` documentation, plus the
# empty-pattern error case.
#
# The group is opened with `Test.group`; the original spelled it `Text.group`,
# which refers to the `Text` type rather than the test framework.
Test.group "should correctly evaluate documentation examples" <|
    Test.specify "example 1" <|
        pattern = Regex_2.compile 'aa'
        # Matches do not overlap, so only the first "aa" of "aaa" is replaced.
        pattern.replace 'aaa' 'b' . should_equal 'ba'

    Test.specify "example 2" <|
        pattern = Regex_2.compile '[lo]'
        pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!'

    Test.specify "example 3" <|
        pattern = Regex_2.compile 'l'
        pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!'

    Test.specify "example 4" <|
        pattern = Regex_2.compile '"(.*?)"'
        pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz'

    Test.specify "example 5" <|
        pattern = Regex_2.compile "aa"
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "xyz"
        match . should_equal "xyz ab xyz ac ad xyz xyz ax"

    Test.specify "example 6" <|
        pattern = Regex_2.compile "([a-z]+)"
        pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"

    Test.specify "`replace` with an empty pattern should be an error" <|
        pattern = Regex_2.compile ""
        pattern.replace "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
# The shared pattern has four groups: 1 (unnamed), 2 (`letters`), 3 (unnamed,
# optional) and 4 (`empty`, optional). The two optional groups do not take part
# in the match.
Test.group "Match.text" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2

    Test.specify "should return the full match with index 0" <|
        found.text 0 . should_equal "aa ab abc a bc bcd"

    Test.specify "should return the group contents if it matches by index" <|
        found.text 1 . should_equal "aa ab "

    Test.specify "should return the group contents if it matches by name" <|
        found.text "letters" . should_equal "abc a bc bcd"

    Test.specify "should return Nothing if the group did not match" <|
        found.text 3 . should_equal Nothing

    Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
        found.text "fail" . should_fail_with No_Such_Group
        found.text 5 . should_fail_with No_Such_Group

    Test.specify "should make named groups accessible by index" <|
        by_name = found.text "letters"
        found.text 2 . should_equal by_name
|
||||
|
||||
# `groups` lists the whole match followed by every group; groups that did not
# take part are `Nothing` unless a substitute value is supplied.
Test.group "Match.groups" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2

    Test.specify "should return the results of all groups" <|
        all_groups = found.groups
        all_groups.length . should_equal 5
        all_groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing]

    Test.specify "should replace unmatched groups by a user-specified value" <|
        all_groups = found.groups "UNMATCHED"
        all_groups.length . should_equal 5
        all_groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"]
|
||||
|
||||
# `named_groups` exposes only the named groups (`letters` and `empty`), keyed
# by name.
Test.group "Match.named_groups" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2.Value

    Test.specify "should return the results of all named groups" <|
        by_name = found.named_groups
        by_name.keys.sort . should_equal ["empty", "letters"]
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal Nothing

    Test.specify "should replace unmatched groups by a user-specified value" <|
        by_name = found.named_groups "UNMATCHED"
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal "UNMATCHED"
|
||||
|
||||
# Start offsets of groups, addressed by index or by name.
Test.group "Match.start" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2

    Test.specify "should return the start of a group by index" <|
        found.start 1 . should_equal 0

    Test.specify "should return the start of a group by name" <|
        found.start "letters" . should_equal 6

    Test.specify "should return Nothing if the group didn't match" <|
        # Group 3 and `empty` are the optional groups of the pattern.
        found.start 3 . should_equal Nothing
        found.start "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.start 5 . should_fail_with No_Such_Group
        found.start "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# End offsets of groups, addressed by index or by name.
Test.group "Match.end" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2

    Test.specify "should return the end of a group by index" <|
        found.end 1 . should_equal 6

    Test.specify "should return the end of a group by name" <|
        found.end "letters" . should_equal 18

    Test.specify "should return Nothing if the group didn't match" <|
        # Group 3 and `empty` are the optional groups of the pattern.
        found.end 3 . should_equal Nothing
        found.end "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.end 5 . should_fail_with No_Such_Group
        found.end "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# UTF-16 start offsets of groups, addressed by index or by name.
Test.group "Match.utf_16_start" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2

    Test.specify "should return the start of a group by index" <|
        found.utf_16_start 1 . should_equal 0

    Test.specify "should return the start of a group by name" <|
        found.utf_16_start "letters" . should_equal 6

    Test.specify "should return Nothing if the group didn't match" <|
        # Group 3 and `empty` are the optional groups of the pattern.
        found.utf_16_start 3 . should_equal Nothing
        found.utf_16_start "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_start 5 . should_fail_with No_Such_Group
        found.utf_16_start "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# UTF-16 end offsets of groups, addressed by index or by name.
Test.group "Match.utf_16_end" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2

    Test.specify "should return the end of a group by index" <|
        found.utf_16_end 1 . should_equal 6

    Test.specify "should return the end of a group by name" <|
        found.utf_16_end "letters" . should_equal 18

    Test.specify "should return Nothing if the group didn't match" <|
        # Group 3 and `empty` are the optional groups of the pattern.
        found.utf_16_end 3 . should_equal Nothing
        found.utf_16_end "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_end 5 . should_fail_with No_Such_Group
        found.utf_16_end "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# `span` pairs a group's offset range with the text it was matched against.
Test.group "Match.span" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2

    Test.specify "should get the span of a group by index" <|
        expected = Span.Value (0.up_to 6) haystack
        found.span 1 . should_equal expected

    Test.specify "should get the span of a group by name" <|
        expected = Span.Value (6.up_to 18) haystack
        found.span "letters" . should_equal expected

    Test.specify "should return Nothing if the group didn't match" <|
        # Group 3 and `empty` are the optional groups of the pattern.
        found.span 3 . should_equal Nothing
        found.span "empty" . should_equal Nothing

    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        found.span 5 . should_fail_with No_Such_Group
        found.span "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# `utf_16_span` pairs a group's UTF-16 offset range with the matched text.
Test.group "Match.utf_16_span" <|
    compiled = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    haystack = "aa ab abc a bc bcd"
    found = compiled.match haystack
    found.should_be_a Match_2

    Test.specify "should get the UTF16 span of a group by index" <|
        expected = Utf_16_Span.Value (0.up_to 6) haystack
        found.utf_16_span 1 . should_equal expected

    Test.specify "should get the UTF16 span of a group by name" <|
        expected = Utf_16_Span.Value (6.up_to 18) haystack
        found.utf_16_span "letters" . should_equal expected

    Test.specify "should return Nothing if the group didn't match" <|
        # Group 3 and `empty` are the optional groups of the pattern.
        found.utf_16_span 3 . should_equal Nothing
        found.utf_16_span "empty" . should_equal Nothing

    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_span 5 . should_fail_with No_Such_Group
        found.utf_16_span "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# Fills the replacer cache with one more distinct replacement string than
# `get_lru_size`, then checks that the first entry is gone while the second is
# still present.
Test.group "caching" <|
    Test.specify "Replacer cache drops old values" <|
        compiled = Regex_2.compile '([a-c])'

        # Add enough values to flush out the first values.
        0.up_to get_lru_size+1 . map index->
            replaced = compiled.replace "abcdef" ("$1$1x" + index.to_text)
            replaced.should_not_equal Nothing

        replacer_cache_lookup "$1$1x0" . should_equal Nothing
        replacer_cache_lookup "$1$1x1" . should_not_equal Nothing
|
||||
|
||||
# Defines `main` as running `spec` through `Test_Suite.run_main`.
main = Test_Suite.run_main spec
|
@ -1,30 +1,507 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Text.Span.Span
|
||||
import Standard.Base.Data.Text.Span.Utf_16_Span
|
||||
import Standard.Base.Data.Text.Regex
|
||||
import Standard.Base.Data.Text.Regex.Match.Match
|
||||
import Standard.Base.Data.Text.Regex.No_Such_Group
|
||||
import Standard.Base.Data.Text.Regex.Pattern.Pattern
|
||||
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
|
||||
import Standard.Base.Data.Text.Regex.Replacer.Replacer
|
||||
import Standard.Base.Errors.Common.Type_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
|
||||
|
||||
from Standard.Test import Test, Test_Suite
|
||||
import Standard.Test.Extensions
|
||||
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
|
||||
spec =
|
||||
# `Regex.from_flags` turns boolean flags plus `extra_opts` into a vector of
# `Regex_Option` values.
Test.group "Regex options handling" <|
    Test.specify "should work properly with flag options" <|
        options = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[]
        expected = [Regex_Option.Ascii_Matching, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
        options.should_equal expected

    Test.specify "should properly override vector options" <|
        # `multiline=False` removes Multiline from `extra_opts`; Case_Insensitive
        # (flag given as `Nothing`) is kept.
        options = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[Regex_Option.Multiline, Regex_Option.Case_Insensitive]
        expected = [Regex_Option.Ascii_Matching, Regex_Option.Case_Insensitive, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
        options.should_equal expected
|
||||
|
||||
Test.group "Regexes" <|
|
||||
# Checks `Regex.compile` (result type, syntax errors, empty pattern) and
# `Regex.escape`.
# NOTE(review): this span appears to interleave removed and added diff lines
# (e.g. both `Default_Engine.Pattern.Value` and `Pattern` type checks) - confirm
# against the real file before relying on it.
Test.group "Compile" <|
|
||||
Test.specify "should be able to be compiled" <|
|
||||
pattern = Regex.compile "(?<dots>..)" case_insensitive=True
|
||||
pattern . should_be_a Default_Engine.Pattern.Value
|
||||
pattern.options . should_equal [Regex_Option.Case_Insensitive]
|
||||
pattern . should_be_a Pattern
|
||||
|
||||
Test.specify "should be able to be escaped" <|
|
||||
pattern = "http://example.com"
|
||||
# Expects the `\Q...\E` quoting form of the literal.
Regex.escape pattern . should_equal "\Qhttp://example.com\E"
|
||||
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
|
||||
# The pattern has unbalanced parentheses.
Regex.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error
|
||||
|
||||
## TODO: Missing tests for No_Such_Group_Error
|
||||
Test.specify "should disallow empty patterns in `compile`" <|
|
||||
Regex.compile "" . should_fail_with Illegal_Argument
|
||||
|
||||
# Escaping backslash-prefixes the regex metacharacters of the input.
Test.group "Escape" <|
    Test.specify "should escape an expression for use as a literal" <|
        escaped = Regex.escape "[a-z\d]+"
        escaped.should_equal '\\[a-z\\d\\]\\+'
|
||||
|
||||
# `matches` is a full-input check: a match of only a prefix does not count.
# The non-Text spec is labelled with `Type_Error`, the error it actually
# asserts (the label previously said `Illegal_Argument`).
Test.group "Pattern.matches" <|
    Test.specify "should return True when the pattern matches against the input" <|
        pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        input = "aa ab abc a bc bcd"
        pattern.matches input . should_be_true

    Test.specify "should return False when the pattern doesn't match against the input" <|
        pattern = Regex.compile "aaz"
        input = "aa ab abc a bc bcd"
        pattern.matches input . should_be_false

    Test.specify "should check for full matches" <|
        pattern = Regex.compile "f.o"
        pattern.matches "foo" . should_be_true
        # "foobar" only starts with a match, so it is rejected.
        pattern.matches "foobar" . should_be_false

    Test.specify "`matches` with an empty pattern should be an error" <|
        pattern = Regex.compile ""
        pattern.matches "ABC" . should_fail_with Illegal_Argument

    Test.specify "`matches` against a non-Text should fail with Type_Error" <|
        pattern = Regex.compile "abc"
        pattern.matches 1 . should_fail_with Type_Error
|
||||
|
||||
# `match` returns the first match (or `Nothing`); `match_all` returns a vector
# of matches (possibly empty). Both reject an empty pattern and a non-Text
# input. The non-Text specs are labelled with `Type_Error`, the error they
# actually assert (the labels previously said `Illegal_Argument`).
Test.group "Pattern.match and .match_all" <|
    Test.specify "should be able to `match` the first instance of the pattern in the input" <|
        pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        input = "aa ab abc a bc bcd"
        match = pattern.match input
        match . should_be_a Match
        # Group 0 is the whole match, which here spans the entire input.
        match.text 0 . should_equal input

    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        input = "abc"
        match = pattern.match input
        match . should_equal Nothing

    Test.specify "should be able to `match` the all instances of the pattern in the input" <|
        pattern = Regex.compile "(..)"
        input = "abcdefghij"
        matches = pattern.match_all input
        matches.length . should_equal 5
        matches.at 0 . text 0 . should_equal "ab"
        matches.at 1 . text 0 . should_equal "cd"
        matches.at 2 . text 0 . should_equal "ef"
        matches.at 3 . text 0 . should_equal "gh"
        matches.at 4 . text 0 . should_equal "ij"

    Test.specify "should return `[]` when an all match match fails" <|
        pattern = Regex.compile "(aa)"
        input = "abcdefghij"
        match = pattern.match_all input
        match . should_equal []

    Test.specify "`match` with an empty pattern should be an error" <|
        pattern = Regex.compile ""
        pattern.match "ABC" . should_fail_with Illegal_Argument

    Test.specify "`match_all` with an empty pattern should be an error" <|
        pattern = Regex.compile ""
        pattern.match_all "ABC" . should_fail_with Illegal_Argument

    Test.specify "`match` against a non-Text should fail with Type_Error" <|
        pattern = Regex.compile "abc"
        pattern.match 1 . should_fail_with Type_Error

    Test.specify "`match_all` against a non-Text should fail with Type_Error" <|
        pattern = Regex.compile "abc"
        pattern.match_all 1 . should_fail_with Type_Error
|
||||
|
||||
# `find` / `find_all` return the matched text itself instead of match objects.
Test.group "Pattern.find and .find_all" <|
    Test.specify "should be able to `find` the first instance of the pattern in the input" <|
        compiled = Regex.compile "(..)"
        haystack = "abcdefghij"
        found = compiled.find haystack
        found.should_be_a Text
        found.should_equal "ab"

    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        compiled = Regex.compile "(aa)"
        haystack = "abcdefghij"
        found = compiled.find haystack
        found.should_equal Nothing

    Test.specify "should be able to `find` the all instances of the pattern in the input" <|
        compiled = Regex.compile "(..)"
        haystack = "abcdefghij"
        pieces = compiled.find_all haystack
        pieces.length . should_equal 5
        pieces.at 0 . should_equal "ab"
        pieces.at 1 . should_equal "cd"
        pieces.at 2 . should_equal "ef"
        pieces.at 3 . should_equal "gh"
        pieces.at 4 . should_equal "ij"

    Test.specify "should return `[]` when an all match match fails" <|
        compiled = Regex.compile "(aa)"
        haystack = "abcdefghij"
        pieces = compiled.find_all haystack
        pieces.should_equal []

    Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
        # The final single-character match must not be dropped.
        Regex.compile "(a+|1+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
        Regex.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
        Regex.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"]

    Test.specify "`find` with an empty pattern should be an error" <|
        compiled = Regex.compile ""
        compiled.find "ABC" . should_fail_with Illegal_Argument

    Test.specify "`find_all` with an empty pattern should be an error" <|
        compiled = Regex.compile ""
        compiled.find_all "ABC" . should_fail_with Illegal_Argument
|
||||
|
||||
# Splitting keeps empty segments and returns the input unchanged (as a
# one-element vector) when the pattern does not occur. The non-Text spec is
# labelled with `Type_Error`, the error it actually asserts (the label
# previously said `Illegal_Argument`).
Test.group "Pattern.split" <|
    Test.specify "should be able to `split` on the first instance of the pattern" <|
        pattern = Regex.compile "cd"
        input = "abcdefcdghij"
        texts = pattern.split input only_first=True
        texts . should_equal ["ab", "efcdghij"]

    Test.specify "should return the original text if there are no matches in first mode" <|
        pattern = Regex.compile "aa"
        input = "abcdefghij"
        texts = pattern.split input only_first=True
        texts . should_equal ["abcdefghij"]

    Test.specify "should return the original text if there are no matches in all mode" <|
        pattern = Regex.compile "aa"
        input = "abcdefghij"
        texts = pattern.split input
        texts . should_equal ["abcdefghij"]

    Test.specify "should be able to `split` on the all instances of the pattern in the input" <|
        pattern = Regex.compile "a"
        pattern.split "bacadaeaf" . should_equal ["b", "c", "d", "e", "f"]
        # Adjacent, leading and trailing separators produce empty segments.
        pattern.split "baab" . should_equal ["b", "", "b"]
        pattern.split "aaa" . should_equal ["", "", "", ""]
        pattern.split "" . should_equal [""]
        pattern.split "a" . should_equal ["", ""]
        pattern.split "abaca" . should_equal ["", "b", "c", ""]

    Test.specify "should split without normalization" <|
        pattern = Regex.compile "s"
        # A plain `s` followed by a combining accent is still split on; the accent stays behind.
        pattern.split 'aśsśs\u{301}śb' . should_equal ['aś', 'ś', '\u{301}śb']

    Test.specify "`split` against a non-Text should fail with Type_Error" <|
        pattern = Regex.compile "abc"
        pattern.split 1 . should_fail_with Type_Error
|
||||
|
||||
Test.group "Pattern.tokenize" <|
|
||||
# Without capturing groups each token is the full text of one match.
# The body is passed with an explicit `<|`, the same as the other specs in this suite.
Test.specify "can tokenize with simple regexes without capturing groups" <|
    Regex.compile "[a-z]+" . tokenize "1-800-regex-yes" . should_equal ["regex", "yes"]
    Regex.compile "[a-z]+" case_insensitive=True . tokenize "1-800-REGEX-YES" . should_equal ["REGEX", "YES"]
    Regex.compile "\d\d" . tokenize "12 hi345 67r890r" . should_equal ["12", "34", "67", "89"]
|
||||
|
||||
Test.specify "can tokenize with regexes with capturing groups"
|
||||
Regex.compile "(\d\d)\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||
Regex.compile "[a-z]+(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
|
||||
Regex.compile "[a-z]+(\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["", "182", "20", ""]
|
||||
|
||||
Test.specify "ignores non-capturing groups"
|
||||
Regex.compile "(?:(\d\d)\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||
Regex.compile "(\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||
Regex.compile "(?<foo>\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||
Regex.compile "(?:[a-z]+)(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
|
||||
|
||||
Test.specify "ignores nested groups"
|
||||
Regex.compile "(\d(\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||
Regex.compile "(?<foo>\d(?<bar>\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||
Regex.compile "[a-z]+((\d)\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
|
||||
Regex.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"]
|
||||
|
||||
Test.specify "non-participating groups are rendered as the empty string"
|
||||
Regex.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_4_0" . should_equal ['340']
|
||||
Regex.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_q_0" . should_equal ['3q0']
|
||||
|
||||
Test.specify "handles unicode" <|
|
||||
Regex.compile "[áê]+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
|
||||
# `+` only applies to the accent `\u{301}`, not to the entire grapheme.
|
||||
Regex.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
|
||||
Regex.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}a\u{301}']
|
||||
Regex.compile "x([áê]+)y" . tokenize "xáy xêy" . should_equal ["á", "ê"]
|
||||
|
||||
Test.specify "examples are correct" <|
|
||||
Regex.compile "..." . tokenize "ABCDEF" . should_equal ["ABC","DEF"]
|
||||
Regex.compile "(.).(.)" . tokenize "ABCDEF" . should_equal ["AC","DF"]
|
||||
Regex.compile "(\S+)(?:\s+|$)" . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' . should_equal ["Hello","Big","Wide","World","Goodbye!"]
|
||||
|
||||
# Specs for replacing pattern matches in a text, covering first-only and
# replace-all modes as well as group references inside the replacement.
Test.group "Pattern.replace" <|
    Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
        regex = Regex.compile "abc"
        subject = "aa ab abc a bc abc"
        replaced = regex.replace subject "REPLACED" only_first=True
        replaced . should_be_a Text
        replaced . should_equal "aa ab REPLACED a bc abc"

    Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <|
        regex = Regex.compile "xyz"
        subject = "aa ab ac ad"
        replaced = regex.replace subject "REPLACED" only_first=True
        replaced . should_equal subject

    Test.specify "should be able to replace the all instances of the pattern in the input" <|
        regex = Regex.compile "aa"
        subject = "aa ab aa ac ad aa aa ax"
        replaced = regex.replace subject "REPLACED"
        replaced . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"

    Test.specify "should return the input when an all replace fails" <|
        regex = Regex.compile "aa"
        subject = "abcdefghij"
        replaced = regex.replace subject "REPLACED"
        replaced . should_equal subject

    Test.specify "should be able to replace the entire input only if it matches" <|
        regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        subject = "aa ab abc a bc bcd"
        replaced = regex.replace subject "REPLACED"
        replaced . should_equal "REPLACED"

    Test.specify "should not perform overlapping replacements in all mode" <|
        regex = Regex.compile "(..)"
        subject = "aa ab"
        replaced = regex.replace subject "REPLACED"
        # Two non-overlapping two-character matches; the trailing "b" is left over.
        replaced . should_equal "REPLACEDREPLACEDb"

    Test.specify "should handle capture groups in replacement" <|
        word = Regex.compile "(?<capture>[a-z]+)"

        # Numbered group reference.
        word.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
        word.replace "foo bar, baz" "[$1]" only_first=True . should_equal "[foo] bar, baz"

        # Named group reference.
        word.replace "foo bar, baz" "[$<capture>]" . should_equal "[foo] [bar], [baz]"
        word.replace "foo bar, baz" "[$<capture>]" only_first=True . should_equal "[foo] bar, baz"

        # Whole-match references: `$0` and `$&`.
        word.replace "foo bar, baz" "[$0]" . should_equal "[foo] [bar], [baz]"
        word.replace "foo bar, baz" "[$0]" only_first=True . should_equal "[foo] bar, baz"
        word.replace "foo bar, baz" "[$&]" . should_equal "[foo] [bar], [baz]"
        word.replace "foo bar, baz" "[$&]" only_first=True . should_equal "[foo] bar, baz"

    Test.specify "should handle unicode in capture group names" <|
        word = Regex.compile "(?<건반>[a-z]+)"
        word.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]"
|
||||
|
||||
# Checks that the `replace` examples given in the documentation produce the
# results they claim. The group is registered through `Test.group`, the same
# entry point used by every other group in this file.
Test.group "should correctly evaluate documentation examples" <|
    Test.specify "example 1" <|
        pattern = Regex.compile 'aa'
        pattern.replace 'aaa' 'b' . should_equal 'ba'

    Test.specify "example 2" <|
        pattern = Regex.compile '[lo]'
        pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!'

    Test.specify "example 3" <|
        pattern = Regex.compile 'l'
        pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!'

    Test.specify "example 4" <|
        pattern = Regex.compile '"(.*?)"'
        pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz'

    Test.specify "example 5" <|
        pattern = Regex.compile "aa"
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "xyz"
        match . should_equal "xyz ab xyz ac ad xyz xyz ax"

    Test.specify "example 6" <|
        pattern = Regex.compile "([a-z]+)"
        pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
|
||||
|
||||
# An empty pattern must be rejected with `Illegal_Argument`.
# `replace` is given both of its arguments (input and replacement) so that the
# call is fully applied and the error comes from an actual `replace` call
# rather than from a partially applied method.
Test.specify "`replace` with an empty pattern should be an error" <|
    pattern = Regex.compile ""
    pattern.replace "ABC" "REPLACED" . should_fail_with Illegal_Argument
|
||||
|
||||
# Passing a non-Text input to `replace` is asserted to fail with `Type_Error`;
# the label names that same error type.
Test.specify "`replace` against a non-Text should fail with Type_Error" <|
    pattern = Regex.compile "abc"
    pattern.replace 1 "abc" . should_fail_with Type_Error
|
||||
|
||||
# Specs for `Match.text`: fetching the text of a group by index or by name.
# Shared fixture: groups 1 and 2 (`letters`) match; group 3 and group 4
# (`empty`) are optional and, per the assertions in this file, do not
# participate in the match.
Test.group "Match.text" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match

    Test.specify "should return the full match with index 0" <|
        found.text 0 . should_equal "aa ab abc a bc bcd"

    Test.specify "should return the group contents if it matches by index" <|
        found.text 1 . should_equal "aa ab "

    Test.specify "should return the group contents if it matches by name" <|
        found.text "letters" . should_equal "abc a bc bcd"

    Test.specify "should return Nothing if the group did not match" <|
        found.text 3 . should_equal Nothing

    Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
        found.text "fail" . should_fail_with No_Such_Group
        found.text 5 . should_fail_with No_Such_Group

    Test.specify "should make named groups accessible by index" <|
        found.text 2 . should_equal (found.text "letters")
|
||||
|
||||
# Specs for `Match.groups`: the list of all group texts, with index 0 being
# the full match and an optional placeholder for groups that did not match.
Test.group "Match.groups" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match

    Test.specify "should return the results of all groups" <|
        all_groups = found.groups
        all_groups.length . should_equal 5
        all_groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing]

    Test.specify "should replace unmatched groups by a user-specified value" <|
        all_groups = found.groups "UNMATCHED"
        all_groups.length . should_equal 5
        all_groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"]
|
||||
|
||||
# Specs for `Match.named_groups`: a mapping from group name to matched text,
# with an optional placeholder for named groups that did not match.
Test.group "Match.named_groups" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match.Value

    Test.specify "should return the results of all named groups" <|
        by_name = found.named_groups
        by_name.keys.sort . should_equal ["empty", "letters"]
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal Nothing

    Test.specify "should replace unmatched groups by a user-specified value" <|
        by_name = found.named_groups "UNMATCHED"
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal "UNMATCHED"
|
||||
|
||||
# Specs for `Match.start`: the start position of a group, looked up by index
# or by name.
Test.group "Match.start" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match

    Test.specify "should return the start of a group by index" <|
        found.start 1 . should_equal 0

    Test.specify "should return the start of a group by name" <|
        found.start "letters" . should_equal 6

    Test.specify "should return Nothing if the group didn't match" <|
        found.start 3 . should_equal Nothing
        found.start "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.start 5 . should_fail_with No_Such_Group
        found.start "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# Specs for `Match.end`: the end position of a group, looked up by index or
# by name.
Test.group "Match.end" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match

    Test.specify "should return the end of a group by index" <|
        found.end 1 . should_equal 6

    Test.specify "should return the end of a group by name" <|
        found.end "letters" . should_equal 18

    Test.specify "should return Nothing if the group didn't match" <|
        found.end 3 . should_equal Nothing
        found.end "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.end 5 . should_fail_with No_Such_Group
        found.end "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# Specs for `Match.utf_16_start`: the start position of a group by index or
# by name. For this fixture the expected values equal those of `Match.start`.
Test.group "Match.utf_16_start" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match

    Test.specify "should return the start of a group by index" <|
        found.utf_16_start 1 . should_equal 0

    Test.specify "should return the start of a group by name" <|
        found.utf_16_start "letters" . should_equal 6

    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_start 3 . should_equal Nothing
        found.utf_16_start "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_start 5 . should_fail_with No_Such_Group
        found.utf_16_start "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# Specs for `Match.utf_16_end`: the end position of a group by index or by
# name. For this fixture the expected values equal those of `Match.end`.
Test.group "Match.utf_16_end" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match

    Test.specify "should return the end of a group by index" <|
        found.utf_16_end 1 . should_equal 6

    Test.specify "should return the end of a group by name" <|
        found.utf_16_end "letters" . should_equal 18

    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_end 3 . should_equal Nothing
        found.utf_16_end "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_end 5 . should_fail_with No_Such_Group
        found.utf_16_end "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# Specs for `Match.span`: the `Span` (range plus parent text) covered by a
# group, looked up by index or by name.
Test.group "Match.span" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match

    Test.specify "should get the span of a group by index" <|
        found.span 1 . should_equal (Span.Value (0.up_to 6) subject)

    Test.specify "should get the span of a group by name" <|
        found.span "letters" . should_equal (Span.Value (6.up_to 18) subject)

    Test.specify "should return Nothing if the group didn't match" <|
        found.span 3 . should_equal Nothing
        found.span "empty" . should_equal Nothing

    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        found.span 5 . should_fail_with No_Such_Group
        found.span "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# Specs for `Match.utf_16_span`: the `Utf_16_Span` (range plus parent text)
# covered by a group, looked up by index or by name.
Test.group "Match.utf_16_span" <|
    compiled = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    subject = "aa ab abc a bc bcd"
    found = compiled.match subject
    found . should_be_a Match

    Test.specify "should get the UTF16 span of a group by index" <|
        found.utf_16_span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) subject)

    Test.specify "should get the UTF16 span of a group by name" <|
        found.utf_16_span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) subject)

    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_span 3 . should_equal Nothing
        found.utf_16_span "empty" . should_equal Nothing

    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_span 5 . should_fail_with No_Such_Group
        found.utf_16_span "nonexistent" . should_fail_with No_Such_Group
|
||||
|
||||
# Spec for the replacer cache: after more than `get_lru_size` distinct
# replacement strings have been used, the earliest one is expected to be gone
# from the cache while a later one is still present.
Test.group "caching" <|
    Test.specify "Replacer cache drops old values" <|
        regex = Regex.compile('([a-c])')

        # Use one more distinct replacement string than the cache holds, so
        # that the first entry is pushed out.
        0.up_to get_lru_size+1 . map n->
            replaced = regex.replace "abcdef" ("$1$1x" + n.to_text)
            replaced . should_not_equal Nothing

        # The first replacement string is gone; the second is still cached.
        replacer_cache_lookup "$1$1x0" . should_equal Nothing
        replacer_cache_lookup "$1$1x1" . should_not_equal Nothing
|
||||
|
||||
# Entry point of this spec file: hands `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec
|
||||
|
@ -1,6 +1,6 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Text.Regex_2.No_Such_Group
|
||||
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
|
||||
import Standard.Base.Data.Text.Regex.No_Such_Group
|
||||
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
|
||||
import Standard.Base.Data.Text.Span.Span
|
||||
import Standard.Base.Data.Text.Span.Utf_16_Span
|
||||
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
|
||||
@ -9,8 +9,6 @@ import Standard.Base.Errors.Common.Type_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
import Standard.Base.IO
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
|
||||
from Standard.Base.Data.Text.Text_Sub_Range.Text_Sub_Range import all
|
||||
from Standard.Base.Data.Index_Sub_Range.Index_Sub_Range import all
|
||||
|
||||
|
@ -223,7 +223,6 @@ spec = Test.group "Vectors" <|
|
||||
["abab", "aaabaaaa"].filter (Filter_Condition.Like "_ba_") . should_equal ["abab"]
|
||||
["abab", "aaabaaaa"].filter (Filter_Condition.Like "%ba__%") . should_equal ["aaabaaaa"]
|
||||
["aaaa", "bbbbb", "[ab]aaaa"].filter (Filter_Condition.Like "[ab]%") . should_equal ["[ab]aaaa"]
|
||||
["a\Qa\Eabb", "aaabb"].filter (Filter_Condition.Like "_\Qa\Ea%") . should_equal ["a\Qa\Eabb"]
|
||||
["f.txt", "abc.*"].filter (Filter_Condition.Like "%.*") . should_equal ["abc.*"]
|
||||
["f.txt", "abc.*"].filter (Filter_Condition.Not_Like "%.*") . should_equal ["f.txt"]
|
||||
|
||||
|
@ -50,9 +50,7 @@ import project.Data.Regression_Spec
|
||||
|
||||
import project.Data.Text_Spec
|
||||
import project.Data.Text.Text_Sub_Range_Spec
|
||||
import project.Data.Text.Default_Regex_Engine_Spec
|
||||
import project.Data.Text.Encoding_Spec
|
||||
import project.Data.Text.Matching_Spec
|
||||
import project.Data.Text.Regex_Spec
|
||||
import project.Data.Text.Span_Spec
|
||||
import project.Data.Text.Utils_Spec
|
||||
@ -126,10 +124,8 @@ main = Test_Suite.run_main <|
|
||||
Problems_Spec.spec
|
||||
Range_Spec.spec
|
||||
Ref_Spec.spec
|
||||
Lazy_Spec.spec
|
||||
Default_Regex_Engine_Spec.spec
|
||||
Regex_Spec.spec
|
||||
Matching_Spec.spec
|
||||
Lazy_Spec.spec
|
||||
Runtime_Spec.spec
|
||||
Self_Type_Spec.spec
|
||||
Span_Spec.spec
|
||||
|
Loading…
Reference in New Issue
Block a user