Remove old (Java) Regex library and replace with new (Truffle) library. (#6195)

Remove old (Java) Regex library and replace with new (Truffle) library.
This commit is contained in:
GregoryTravis 2023-04-04 15:58:26 -04:00 committed by GitHub
parent 2531aeeece
commit d9bc5246ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
30 changed files with 1231 additions and 3681 deletions

View File

@ -100,13 +100,11 @@ type Filter_Condition
Table operations, it can accept another column - then the corresponding
values from the source column and the provided column are checked.
! Known Bugs
There is a known bug in Java Regex where escape characters are not
handled properly in Unicode-normalized matching mode. Due to this
limitation, Unicode normalization has been disabled for this function,
so beware that some equivalent graphemes like 'ś' and 's\u0301' will
not be matched.
See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
! Known Limitations.
The Truffle regex engine does not transparently handle normalization.
Due to this limitation, Unicode normalization has been disabled for
this function, so beware that some equivalent graphemes like 'ś' and
's\u0301' will not be matched.
Like pattern:Text
## Does the value not match the SQL pattern (Text only)?
@ -121,13 +119,11 @@ type Filter_Condition
Table operations, it can accept another column - then the corresponding
values from the source column and the provided column are checked.
! Known Bugs
There is a known bug in Java Regex where escape characters are not
handled properly in Unicode-normalized matching mode. Due to this
limitation, Unicode normalization has been disabled for this function,
so beware that some equivalent graphemes like 'ś' and 's\u0301' will
not be matched.
See https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
! Known Limitations.
The Truffle regex engine does not transparently handle normalization.
Due to this limitation, Unicode normalization has been disabled for
this function, so beware that some equivalent graphemes like 'ś' and
's\u0301' will not be matched.
Not_Like pattern:Text
## Is the value contained in `values`?
@ -212,7 +208,4 @@ type Filter_Condition
## PRIVATE
sql_like_to_regex sql_pattern =
regex_pattern = Regex_Utils.sql_like_pattern_to_regex sql_pattern
## There is a bug with Java Regex in Unicode normalized mode (CANON_EQ) with quoting.
https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8032926
Once that bug is fixed, `match_ascii` may be set back to `False`.
Regex.compile regex_pattern dot_matches_newline=True match_ascii=True
Regex.compile regex_pattern

View File

@ -9,7 +9,6 @@ from project.Data.Boolean import Boolean, True, False
polyglot java import org.enso.base.Text_Utils
## Enso's text type.
Enso's text type is natively unicode aware, and will handle arbitrary

View File

@ -12,10 +12,9 @@ import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Data.Text.Encoding.Encoding
import project.Data.Text.Location.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
import project.Data.Text.Regex_2
import project.Data.Text.Regex_2.Regex_Syntax_Error
import project.Data.Text.Regex.Regex_Syntax_Error
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
@ -233,7 +232,7 @@ Text.characters self =
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error | Illegal_Argument
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
compiled_pattern.match self
## Finds all the matches of the regular expression `pattern` in `self`,
@ -260,7 +259,7 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error | Illegal_Argument
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
compiled_pattern.match_all self
## ALIAS Check Matches
@ -290,7 +289,7 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error | Illegal_Argument
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
compiled_pattern.matches self
## ALIAS Split Text
@ -348,7 +347,7 @@ Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_re
True -> case delimiter of
_ : Text ->
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
compiled_pattern = Regex.compile delimiter case_insensitive=case_insensitive
compiled_pattern.split self
_ : Vector ->
parenthesize s = "(?:" + s + ")"
@ -383,7 +382,7 @@ Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive use_re
Text.tokenize : Text -> Case_Sensitivity -> Vector Text
Text.tokenize self pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
compiled_pattern.tokenize self
## ALIAS Replace Text
@ -477,7 +476,7 @@ Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive o
Text_Utils.replace_spans self spans_array replacement
True ->
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
compiled_pattern = Regex.compile term case_insensitive=case_insensitive
compiled_pattern.replace self replacement only_first
## ALIAS Get Words

View File

@ -0,0 +1,17 @@
from Standard.Base import all
import project.Any.Any
import project.Data.Locale.Locale
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Errors.Common.Type_Error
import project.Meta
## PRIVATE
   Run `action` only if `text_maybe` is a `Text`; otherwise throw a
   `Type_Error` built from the expected type (`Text`), the actual type of
   `text_maybe`, and the argument name "text_maybe".

   Arguments:
   - text_maybe: The value that is required to be a `Text`.
   - action: The suspended computation evaluated when the check passes.
expect_text : Any -> Any -> Any ! Type_Error
expect_text text_maybe ~action =
    case text_maybe of
        _ : Text -> action
        _ ->
            actual_type = Meta.type_of text_maybe
            Error.throw (Type_Error.Error Text actual_type "text_maybe")

View File

@ -1,110 +0,0 @@
import project.Data.Numbers.Integer
import project.Data.Pair.Pair
import project.Data.Range.Extensions
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Problem_Behavior.Problem_Behavior
import project.Panic.Panic
import project.Panic.Wrapped_Dataflow_Error
from project.Data.Boolean import Boolean, True, False
## UNSTABLE
   An error indicating that some criteria did not match any names in the
   input.

   Arguments:
   - criteria: The criteria that did not match anything.
type No_Matches_Found
    Error (criteria : Vector Text)

    # Human-readable description listing the unmatched criteria.
    to_display_text : Text
    to_display_text self =
        unmatched = self.criteria.to_text
        "The criteria " + unmatched + " did not match any names in the input."
## PRIVATE
   Selects the `objects` matched by `criteria` and attaches a
   `No_Matches_Found` problem, through `on_problems`, when some criteria
   matched nothing.
match_criteria_implementation matcher objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning =
    outcome = internal_match_criteria_implementation matcher objects criteria reorder name_mapper
    leftover_criteria = outcome.second
    problems = case leftover_criteria.is_empty of
        True -> []
        False -> [No_Matches_Found.Error leftover_criteria]
    on_problems.attach_problems_after outcome.first problems
## PRIVATE
   Selects the `objects` matched by `criteria`, passing the criteria that
   matched nothing to `problem_callback` before returning the selection.
match_criteria_callback matcher objects criteria problem_callback reorder=False name_mapper=(x->x) =
    outcome = internal_match_criteria_implementation matcher objects criteria reorder name_mapper
    # The callback always runs, even when the list of unmatched criteria is empty.
    problem_callback outcome.second
    outcome.first
type Match_Matrix
    ## PRIVATE
       A helper type holding a matrix of matches.

       Arguments:
       - matrix: The match flags, read as `matrix . at i . at j` where `i`
         is an object index and `j` is a criterion index.
       - criteria: The criteria the columns correspond to.
       - objects: The objects the rows correspond to.
    Value matrix criteria objects

    # Tells whether at least one criterion matches the ith object.
    is_object_matched_by_anything : Integer -> Boolean
    is_object_matched_by_anything self i =
        row = self.matrix.at i
        row.any (flag -> flag)

    # Tells whether the ith criterion matches at least one object.
    does_criterion_match_anything : Integer -> Boolean
    does_criterion_match_anything self i =
        column = self.matrix.map (row -> row.at i)
        column.any (flag -> flag)

    ## PRIVATE
       Extracts the list of criteria that did not have any matches.
    unmatched_criteria self =
        self.criteria.filter_with_index j-> _->
            (self.does_criterion_match_anything j).not

    ## PRIVATE
       Returns the list of criteria that match the ith object.
    criteria_matching_object : Integer -> Vector
    criteria_matching_object self i =
        self.criteria.filter_with_index j-> _->
            (self.matrix.at i).at j

    ## PRIVATE
       Returns the list of criteria indices that match the ith object.
    criteria_indices_matching_object : Integer -> Vector
    criteria_indices_matching_object self i =
        candidate_indices = 0.up_to self.criteria.length
        candidate_indices.filter j->
            (self.matrix.at i).at j
## PRIVATE
   Builds the `Match_Matrix` describing which criteria match which object.

   The resulting matrix satisfies: `match_matrix . at i . at j` is `True` if
   and only if `objects.at i` matches `criteria.at j`. Objects are passed
   through `object_name_mapper` and criteria through `criterion_mapper` before
   being handed to `matcher.match_single_criterion`.
make_match_matrix matcher objects criteria object_name_mapper=(x->x) criterion_mapper=(x->x) =
    # One row per object, one entry per criterion.
    build_row object = criteria.map criterion->
        object_name = object_name_mapper object
        mapped_criterion = criterion_mapper criterion
        matcher.match_single_criterion object_name mapped_criterion
    rows = objects.map build_row
    Match_Matrix.Value rows criteria objects
## PRIVATE
   Shared implementation behind the criteria-matching helpers.

   Returns a `Pair` whose first element is the vector of selected objects and
   whose second element is the vector of criteria that matched nothing.

   Arguments:
   - matcher: Forwarded to `make_match_matrix` to decide whether an object
     matches a criterion.
   - objects: The candidates to select from.
   - criteria: The criteria to match the objects against.
   - reorder: When `True`, the selection follows the order of `criteria`
     (objects matched by several criteria are kept once, via `distinct`);
     when `False`, it follows the order of `objects`.
   - name_mapper: Applied to each object before matching.
internal_match_criteria_implementation matcher objects criteria reorder=False name_mapper=(x->x) = Panic.catch Wrapped_Dataflow_Error (handler = x-> x.payload.unwrap) <|
    ## TODO [RW] discuss: this line of code also shows an issue we had with ensuring input dataflow-errors are correctly propagated, later on we stopped doing that and testing for that as it was too cumbersome. Maybe it could be helped with an @Accepts_Error annotation similar to the one from the interpreter???
    # Any dataflow error among the inputs is rethrown as a wrapped panic, which
    # the enclosing `Panic.catch` unwraps again.
    [matcher, objects, criteria, reorder, name_mapper] . each v->
        Panic.rethrow (v.map_error Wrapped_Dataflow_Error.Error)

    match_matrix = make_match_matrix matcher objects criteria name_mapper
    unmatched_criteria = match_matrix.unmatched_criteria

    # Selects object indices which satisfy the provided predicate.
    # The result holds indices into `objects`, hence `Vector Integer`. The
    # parameter is named `predicate` to avoid shadowing the outer `matcher`.
    select_matching_indices : (Integer -> Boolean) -> Vector Integer
    select_matching_indices predicate =
        0.up_to objects.length . to_vector . filter predicate

    selected_indices = case reorder of
        True ->
            # Collect the matching indices per criterion, in criteria order.
            nested_indices = 0.up_to criteria.length . map j->
                is_object_matched_by_this_criterion i =
                    match_matrix.matrix.at i . at j
                select_matching_indices is_object_matched_by_this_criterion
            nested_indices.flat_map x->x . distinct
        False ->
            select_matching_indices match_matrix.is_object_matched_by_anything

    result = selected_indices.map objects.at
    Pair.new result unmatched_criteria

View File

@ -1,124 +1,63 @@
## This module contains the basic interface to the more advanced functionality
of Enso's regular expression engine.
TODO Examples
import project.Data.Boolean.Boolean
import project.Any.Any
import project.Data.Numbers.Integer
import project.Data.Text.Regex.Engine.Engine
import project.Data.Text.Prim_Text_Helper
import project.Data.Text.Regex.Pattern.Pattern
import project.Data.Text.Regex.Engine.Default
import project.Data.Text.Regex.Regex_Option.Regex_Option
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Common.Compile_Error
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
from project.Errors.Common import Syntax_Error
polyglot java import org.enso.base.Regex_Utils
## Compile the provided `expression` into a regex pattern that can be used for
matching.
Arguments
- expression: The text representing the regular expression that you want to
compile.
- engine: The regular expression engine to use. It defaults to Enso's
built-in one which has good performance and a full feature-set.
- match_ascii: Enables or disables pure-ASCII matching for the regex. If you
know your data only contains ASCII then you can enable this for a
performance boost on some regex engines.
compile. Must be non-empty.
- case_insensitive: Enables or disables case-insensitive matching. Case
insensitive matching behaves as if it normalises the case of all input
text before matching on it.
- dot_matches_newline: Enables or disables the dot matches newline option.
This specifies that the `.` special character should match everything
_including_ newline characters. Without this flag, it will match all
characters _except_ newlines.
- multiline: Enables or disables the multiline option. Multiline specifies
that the `^` and `$` pattern characters match the start and end of lines,
as well as the start and end of the input respectively.
- comments: Enables or disables the comments mode for the regular expression.
In comments mode, the following changes apply:
- Whitespace within the pattern is ignored, except when within a
character class or when preceded by an unescaped backslash, or within
grouping constructs (e.g. `(?...)`).
- When a line contains a `#`, that is not in a character class and is not
preceded by an unescaped backslash, all characters from the leftmost
such `#` to the end of the line are ignored. That is to say, they act
as _comments_ in the regex.
- extra_opts: Specifies additional options in a vector. This allows options
to be supplied and computed without having to break them out into arguments
to the function. Where these overlap with one of the flags (`match_ascii`,
`case_insensitive`, `dot_matches_newline`, `multiline` and `verbose`), the
flags take precedence.
! Boolean Flags and Extra Options
This function contains a number of arguments that are boolean flags that
enable or disable common options for the regex. At the same time, it also
provides the ability to specify options in the `extra_opts` argument.
Where one of the flags is _set_ (has the value `True` or `False`), the
value of the flag takes precedence over the value in `extra_opts` when
merging the options to the engine. The flags are _unset_ (have value
`Nothing`) by default.
If an empty regex is used, `compile` throws an Illegal_Argument error.
? Why Compile?
While many regex engines are able to cache ad-hoc patterns, it is often
useful to be able to manually retain a pattern that you have computed. This
function exists so you can hold onto the resultant `Pattern` object,
instead of immediately proceeding to match using it.
compile : Text -> Engine -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Vector Regex_Option -> Pattern ! Compile_Error
compile expression engine=Default.new match_ascii=Nothing case_insensitive=Nothing dot_matches_newline=Nothing multiline=Nothing comments=Nothing extra_opts=[] =
options_vec = from_flags match_ascii case_insensitive dot_matches_newline multiline comments extra_opts
engine.compile expression options_vec
compile : Text -> Boolean | Nothing -> Pattern ! Regex_Syntax_Error | Illegal_Argument
compile self expression case_insensitive=Nothing =
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
options_string = if case_insensitive == True then "usgi" else "usg"
## Escape the special characters in `expression` such that the result is a valid
literal pattern for the original string.
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
Pattern.Value internal_regex_object
## ADVANCED
Escape the special characters in `expression` such that the result is a
valid literal pattern for the original string.
Arguments:
- expression: The expression to escape metacharacters in.
- engine: The regular expression engine to use. It defaults to Enso's
built-in one which has good performance and a full feature-set.
! Matching Engines
Care should be taken to ensure that you use the same engine for escaping
and matching, as engine syntax may differ in certain cases.
escape : Text -> Engine -> Text
escape expression engine=Default.new = engine.escape expression
> Example
Turn a Text into a regex that matches that string exactly.
## PRIVATE
example_escape =
literal_string = "\!\.|abcde"
Regex.escape literal_string
escape : Text -> Text
escape self expression = Regex_Utils.regexQuote expression
Turns the options flags into a vector of options.
from_flags : Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Boolean | Nothing -> Vector Regex_Option -> Vector Regex_Option
from_flags match_ascii case_insensitive dot_matches_newline multiline comments extra_opts =
builder = Vector.new_builder
process_override : Boolean | Nothing -> Regex_Option -> Nothing
process_override param option = case param of
_ : Boolean -> if param then builder.append option
Nothing -> if extra_opts.contains option then builder.append option
process_override match_ascii Regex_Option.Ascii_Matching
process_override case_insensitive Regex_Option.Case_Insensitive
process_override dot_matches_newline Regex_Option.Dot_Matches_Newline
process_override multiline Regex_Option.Multiline
process_override comments Regex_Option.Comments
## Add any non-overridable options from extra_opts
extra_opts.each opt->
not_ascii = opt != Regex_Option.Ascii_Matching
not_insensitive = opt != Regex_Option.Case_Insensitive
not_dot_matches_newline = opt != Regex_Option.Dot_Matches_Newline
not_multiline = opt != Regex_Option.Multiline
not_comments = opt != Regex_Option.Comments
if not_ascii && not_insensitive && not_dot_matches_newline && not_multiline && not_comments then
builder.append opt
builder.to_vector
## PRIVATE
An error that is emitted when there is no such group in the match for the
## An error that is emitted when there is no such group in the match for the
provided `id`.
Arguments:
@ -134,46 +73,10 @@ type No_Such_Group
_ : Integer -> "No group exists with the index " + self.id.to_text + "."
_ : Text -> "No group exists with the name " + self.id + "."
## PRIVATE
An error representing that one of the passed options was invalid.
Arguments:
- opt: The option that was not valid for this regex engine.
type Invalid_Option
Error (opt : Any)
## A syntax error reported by the Truffle regex compiler.
type Regex_Syntax_Error
## PRIVATE
Provides a human-readable representation of the invalid option error.
to_display_text : Text
to_display_text self =
"The option " + self.opt.to_text + " is not valid for the default regex engine."
## PRIVATE
An error representing that there is something wrong with the mode for a regex
match.
Arguments:
- message: The text of the message to display to users.
type Mode_Error
Error (message : Text)
## PRIVATE
Provides a human-readable representation of the mode error.
to_display_text : Text
to_display_text self = self.message.to_text
## PRIVATE
An error representing that the bounds for a match are invalid.
type Invalid_Bounds_Error
## PRIVATE
Provides a human-readable representation of the invalid bounds error.
to_display_text : Text
to_display_text =
"The start bound cannot be greater than the end bound."
Arguments:
- message: A description of the erroneous syntax.
Error message

View File

@ -1,51 +0,0 @@
## An `Engine` is a configuration and behaviour specification object for a
particular regular expression engine.
An implementation of a regular expression engine must implement the below
interface, as well as conform to the following requirements:
- The engine must operate in a unicode mode by default, using canonical
form for equality and the unicode versions of the standard character
classes.
- It must support the standard options specified in
`Standard.Base.Data.Text.Regex.Regex_Option`. It may specify additional,
engine-specific options, but this is not required by the specification.
- In the defining module, the engine implementation must provide a full
specification of its syntax in the module documentation block.
This file is _not executable_. It instead describes the interface for the
customisable `Engine` and `Pattern` types.
import project.Data.Text.Text
import project.Data.Text.Regex.Regex_Option.Regex_Option
import project.Data.Text.Regex.Invalid_Option
import project.Data.Text.Regex.Pattern.Pattern
import project.Data.Vector.Vector
import project.Errors.Common.Compile_Error
import project.Errors.Unimplemented.Unimplemented
## The `Data.Text.Regex.Engine.Engine` interface.
type Engine
## PRIVATE
Compile the provided `expression` into a regex pattern that can be used
for matching.
Arguments
- expression: The text representing the regular expression that you want
to compile.
- options: The options to configure the matching process with. These are
merged with the specific `engine_opts`.
compile : Text -> Vector Regex_Option -> Pattern ! (Compile_Error | Invalid_Option)
compile self _ _ = Unimplemented.throw "This is an interface only."
## PRIVATE
Escape the special characters in `expression` such that the result is a
valid literal pattern for the original string.
Arguments:
- expression: The expression to escape metacharacters in.
escape : Text -> Text
escape self _ = Unimplemented.throw "This is an interface only."

View File

@ -1,888 +0,0 @@
## Enso's default regular expression matching engine.
Enso's default regular expression engine uses Java's regular expression
syntax, extended with support for the unicode character classes and
properties. A detailed explanation of the syntax is below.
! Raw Strings
Enso has support for raw strings using the `""` quotes. Within a raw
string, all characters are interpreted to mean themselves. This means that
you do not need to double-escape special characters in regular expressions.
! Characters and Regex
When the default regex engine provides a position with regard to
"characters", it is referring to positions in terms of the UTF-16
characters in the text. These indices must be used to index into the
vector of UTF-16 characters. It will otherwise be wrong.
! Escaping
The backslash character `"\"` serves to introduce escaped constructs, as
defined in "Syntax Specification" below, as well as to quote characters
that would otherwise be interpreted as unescaped constructs. As a result,
the expression `"\\"` matches a single backslash, and `"\{"` matches an
opening brace.
It is a parse error for the regular expression to use a backslash prior to
any alphabetic character that does not denote an escaped construct. It is,
however, valid to put a backslash before any symbolic character.
? Syntax Specification
The syntax supported by the default regular expression engine is described
here. The pattern described by the regular expression can then be used to
match against text.
TBC
import project.Any.Any
import project.Data.Map.Map
import project.Data.Numbers.Integer
import project.Data.Range.Extensions
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex.Invalid_Option
import project.Data.Text.Regex.Invalid_Bounds_Error
import project.Data.Text.Regex.Mode_Error
import project.Data.Text.Regex.No_Such_Group
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
import project.Data.Text.Regex.Regex_Option.Regex_Option
import project.Data.Text.Text
import project.Data.Text.Span.Utf_16_Span
import project.Data.Vector.Vector
import project.Meta
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
from project.Errors.Common import Compile_Error, Syntax_Error
polyglot java import java.lang.IllegalArgumentException
polyglot java import java.lang.IndexOutOfBoundsException
polyglot java import java.lang.StringBuffer
polyglot java import java.util.regex.Matcher as Java_Matcher
polyglot java import java.util.regex.Pattern as Java_Pattern
polyglot java import java.util.regex.PatternSyntaxException
polyglot java import com.ibm.icu.impl.UnicodeRegex
polyglot java import org.enso.base.Regex_Utils
polyglot java import org.enso.base.Text_Utils
## Construct an instance of the default engine.
Arguments:
- opts: Any engine-specific options.
> Example
Build a new default engine specifying literal mode.
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
example_new =
engine_opts = [Default_Engine.Option.Literal_Pattern]
Default_Engine.new engine_opts
new : Vector (Regex_Option | Option) -> Default_Engine
new opts=[] = Default_Engine.Value opts
## The default implementation of the `Data.Text.Regex.Engine.Engine` interface.
type Default_Engine
## PRIVATE
The default regex engine for Enso.
Arguments:
- engine_opts: Options for regex matching that are specific to this
engine.
Value (engine_opts : Vector (Regex_Option | Option))
## ADVANCED
Compile the provided `expression` into a regex pattern that can be used
for matching.
Arguments
- expression: The text representing the regular expression that you want
to compile.
- options: The options to configure the matching process with. These are
merged with the specific `engine_opts`.
? Why Compile?
While many regex engines are able to cache ad-hoc patterns, it is often
useful to be able to manually retain a pattern that you have computed.
This function exists so you can hold onto the resultant `Pattern`
object, instead of immediately proceeding to match using it.
> Example
Compile the regex `"^a$"` in multiline mode so it matches all lines
consisting of a single "a".
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
example_compile =
expression = "^a$"
options = [Regex_Option.Multiline]
engine = Default_Engine.new
engine.compile expression options
compile : Text -> Vector (Regex_Option | Option) -> Pattern ! (Compile_Error | Invalid_Option)
compile self expression options =
all_options = options + self.engine_opts
options_bitmask = from_enso_options all_options
unicode_regex = UnicodeRegex.new
maybe_java_pattern = Panic.recover Any <|
Java_Pattern.compile (unicode_regex.transform expression) options_bitmask
internal_pattern = maybe_java_pattern.map_error case _ of
err : PatternSyntaxException -> Syntax_Error.Error ("The regex could not be compiled: " + err.getMessage)
other -> other
Pattern.Value internal_pattern all_options self
## ADVANCED
Escape the special characters in `expression` such that the result is a
valid literal pattern for the original string.
Arguments:
- expression: The expression to escape metacharacters in.
> Example
Turn a literal string into a regex that matches that string exactly.
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
example_escape =
literal_string = "\!\.|abcde"
engine = Default_Engine.new
engine.escape literal_string
escape : Text -> Text
escape self expression = Java_Pattern.quote expression
## The default implementation of the `Data.Text.Regex.Engine.Pattern` interface.
type Pattern
## PRIVATE
The default pattern type for Enso, produced by the default regex engine.
Arguments:
- internal_pattern: The internal representation of the compiled pattern.
- options: The vector of options with which this pattern was built.
- engine: A handle to the engine that built this pattern.
Value (internal_pattern : Java_Pattern) (options : Vector (Regex_Option | Option)) (engine : Default_Engine)
## PRIVATE
Constructs an internal matcher, setting the region as provided and
handling some additional options.
Arguments:
- input: The text on which it will be matching.
- start: The start of the matcher's region.
- end: The end of the matcher's region.
! Unicode Normalization
The Regex engine used here handles string modifiers, like accents, in a
weird way. The string "s\u{301}" will be treated as containing "s"
within it, but "ś" (which is canonically equivalent to the former one)
will not contain "s". To get consistent behavior that does not depend
on the encoding, we normalize all input.
build_matcher : Text -> Integer -> Integer -> Java_Matcher
build_matcher self input start end =
## TODO [RW] Normalization had to be disabled - since start and end are
in code unit space, normalization could shift these indices!
This should be addressed when reviewing
See: https://www.pivotaltracker.com/story/show/181524498
#normalized_input = if self.options.contains Regex_Option.Ascii_Matching then input else
# Text_Utils.normalize input
normalized_input = input
internal_matcher = self.internal_pattern.matcher normalized_input . region start end
if self.options.contains Option.No_Anchoring_Bounds then
internal_matcher.useAnchoringBounds False
if self.options.contains Option.Transparent_Bounds then
internal_matcher.useTransparentBounds True
internal_matcher
## ADVANCED
Tries to match the provided `input` against the pattern `self`.
Arguments:
- input: The text to match the pattern described by `self` against.
- mode: The matching mode to use.
This method will _always_ return `Nothing` if it fails to match.
? Return Type
When asked to match in a mode that can only provide a single match, the
return type is a single `Match` object. When asked to match in a
mode that permits multiple matches, it will always return a `Vector`,
even if only a single match is found.
> Example
Match the first instance of the pattern `".."` in the input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".." []
input = "abcdefghij"
pattern.match input mode=Matching_Mode.First
> Example
Match up to the first 3 instances of the pattern `".."` in the input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".." []
input = "abcdefghij"
pattern.match input mode=3
> Example
Match all instances of the pattern `".."` in the input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".." []
input = "abcdefghij"
pattern.match input
> Example
Check if the pattern `".*"` matches on the entire input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".*" []
input = "abcdefghij"
pattern.match input mode=Regex_Mode.Full
match : Text -> (Regex_Mode | Matching_Mode) -> Match | Vector Match | Nothing
match self input mode=Regex_Mode.All =
# Runs one non-bounded `mode` over the region of `input` delimited by `start`
# and `end`. Both entry points in the final `case` delegate to this helper.
do_match_mode mode start end = case mode of
# First: a single search beginning at `start`; `Nothing` when nothing is found.
Matching_Mode.First ->
internal_matcher = self.build_matcher input start end
if internal_matcher . find start . not then Nothing else
Match.Value internal_matcher start end input
# Integer: collect at most `mode` matches, scanning from `start`.
_ : Integer ->
if mode < 0 then Panic.throw <|
Mode_Error.Error "Cannot match a negative number of times."
builder = Vector.new_builder
go : Integer -> Integer -> Nothing
go offset remaining_count =
should_continue = remaining_count > 0
# Stop once the budget is spent or the scan position has reached `end`.
if should_continue.not || (offset >= end) then Nothing else
# A matcher is built on every iteration and handed to the stored `Match`.
internal_matcher = self.build_matcher input start end
found = internal_matcher.find offset
if found.not then Nothing else
builder.append (Match.Value internal_matcher start end input)
match_end = internal_matcher.end 0
# Ensure progress even if the match is an empty string.
new_offset = if match_end > offset then match_end else offset+1
@Tail_Call go new_offset remaining_count-1
go start mode
vector = builder.to_vector
# An empty result is reported as `Nothing`, never as an empty vector.
if vector.is_empty then Nothing else vector
# All: the same scan as the `Integer` case, but without a match budget.
Regex_Mode.All ->
builder = Vector.new_builder
go : Integer -> Nothing
go offset =
if offset >= end then Nothing else
internal_matcher = self.build_matcher input start end
found = internal_matcher.find offset
if found.not then Nothing else
builder.append (Match.Value internal_matcher start end input)
match_end = internal_matcher.end 0
# Ensure progress even if the match is an empty string.
new_offset = if match_end > offset then match_end else offset+1
@Tail_Call go new_offset
go start
vector = builder.to_vector
# An empty result is reported as `Nothing`, never as an empty vector.
if vector.is_empty then Nothing else vector
# Full: relies on the matcher's `matches` check - presumably "the whole
# region must match", as for `java.util.regex.Matcher.matches`; confirm.
Regex_Mode.Full ->
internal_matcher = self.build_matcher input start end
if internal_matcher.matches.not then Nothing else
Match.Value internal_matcher start end input
# A `Bounded` mode may not be nested inside another `Bounded` mode.
Regex_Mode.Bounded _ _ _ -> Panic.throw <|
Mode_Error.Error "Modes cannot be recursive."
# Entry point: a `Bounded` mode supplies its own region (which must satisfy
# `start < end`); every other mode runs over the whole input.
case mode of
Regex_Mode.Bounded start end sub_mode ->
if start < end then do_match_mode sub_mode start end else
Panic.throw Invalid_Bounds_Error
_ -> do_match_mode mode 0 (Text_Utils.char_length input)
## ADVANCED
Returns `True` if the input matches against the pattern described by
`self`, otherwise `False`.
Arguments:
- input: The text to check for matching.
> Example
Check if the input "aa" matches against the pattern `".."`.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".." []
input = "aa"
pattern.matches input
matches : Text -> Boolean
matches self input =
    # Any successful full-input match counts, whichever shape `match` returns.
    full_match = self.match input mode=Regex_Mode.Full
    case full_match of
        _ : Vector -> True
        _ : Match -> True
        _ -> False
## ADVANCED
Tries to find the text in the `input` that matches against the pattern
`self`.
Arguments:
- input: The text to find matches in.
- mode: The matching mode to use.
This method will _always_ return `Nothing` if it fails to find any
matches.
? Return Type
When asked to find in a mode that can only provide a single match, the
result is a single `Text`. When asked to find in a mode that permits
multiple matches, it will always return a `Vector` of `Text`, even if only
a single match is found.
> Example
Find the first instance of the pattern `".."` in the input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".." []
input = "abcdefghij"
pattern.find input mode=Matching_Mode.First
> Example
Find up to the first 3 instances of the pattern `".."` in the input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".." []
input = "abcdefghij"
pattern.find input mode=3
> Example
Find all instances of the pattern `".."` in the input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".." []
input = "abcdefghij"
pattern.find input
> Example
Find if the pattern `".*"` matches on the entire input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".*" []
input = "abcdefghij"
pattern.find input mode=Regex_Mode.Full
find : Text -> (Regex_Mode | Matching_Mode) -> Text | Vector Text | Nothing
find self input mode=Regex_Mode.All =
    # Group 0 of a match is the text of the whole match.
    whole_text = found-> found.group 0
    case self.match input mode of
        single : Match -> whole_text single
        several : Vector -> several.map whole_text
        # Anything else (i.e. no match) is passed through unchanged.
        other -> other
## ADVANCED
Splits the `input` text based on the pattern described by `self`.
Arguments:
- input: The text to split based on the pattern described by `self`.
- mode: The splitting mode to use.
This method will _always_ return a vector. If no splits take place, the
vector will contain a single element.
> Example
Split the input on the first instance of the pattern `"aa"`.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile "aa" []
input = "abaaabbaabba"
pattern.split input mode=Matching_Mode.First
> Example
Split on up to the first 3 instances of the pattern `"a"` in the input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile "a" []
input = "bacadaeaf"
pattern.split input mode=3
> Example
Split on all instances of the pattern `"a"` in the input.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile "a" []
input = "bacadaeaf"
pattern.split input
split : Text -> Matching_Mode | Regex_Mode | Integer -> Vector Text
split self input mode=Regex_Mode.All =
    # Every unsupported or invalid mode is reported as a `Mode_Error` panic.
    reject message = Panic.throw (Mode_Error.Error message)
    # The Java-side limit is the maximum length of the resulting array, i.e.
    # one more than the number of splits to perform.
    max_pieces = case mode of
        Regex_Mode.All -> -1
        Matching_Mode.First -> 2
        _ : Integer ->
            if mode < 0 then reject "Cannot match a negative number of times."
            mode + 1
        Matching_Mode.Last -> reject "Splitting on the last match is not supported."
        Regex_Mode.Full -> reject "Splitting on a full match yields an empty text."
        Regex_Mode.Bounded _ _ _ -> reject "Splitting on a bounded region is not well-defined."
    java_pieces = self.internal_pattern.split input max_pieces
    Vector.from_polyglot_array java_pieces
## ADVANCED
Replace all occurrences of the pattern described by `self` in the `input`
with the specified `replacement`.
Arguments:
- input: The text in which to perform the replacement(s).
- replacement: The literal text with which to replace any matches.
- mode: The matching mode to use for finding candidates to replace.
If this method performs no replacements it will return the `input` text
unchanged.
> Example
Replace the first occurrence of the pattern `".."` in the input with
the text `"REPLACED"`.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile ".." []
input = "abcdefghij"
pattern.replace input "REPLACED" mode=Matching_Mode.First
> Example
Replace up to the first 3 instances of the pattern `"aa"` in the input
with the text `"REPLACED"`.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile "aa" []
input = "aabbaaaabb"
pattern.replace input "REPLACED" mode=3
> Example
Replace all instances of the pattern `"aa"` in the input with the text
`"REPLACED"`.
import Standard.Base.Data.Text.Regex.Engine.Default
example_match =
engine = Default.new
pattern = engine.compile "aa" []
input = "aabbaabbbbbaab"
pattern.replace input "REPLACED"
replace : Text -> Text -> Regex_Mode | Matching_Mode | Integer -> Text
replace self input replacement mode=Regex_Mode.All =
# Performs the replacement for one non-bounded `mode`. It is only ever called
# from the final `case` below, with `start = 0` and `end` set to the length of
# the input.
do_replace_mode mode start end = case mode of
# First: delegate to the matcher's own `replaceFirst`.
Matching_Mode.First ->
internal_matcher = self.build_matcher input start end
internal_matcher.replaceFirst replacement
# Integer: replace at most `mode` matches, left to right.
_ : Integer ->
if mode < 0 then Panic.throw <|
Mode_Error.Error "Cannot replace a negative number of times."
internal_matcher = self.build_matcher input start end
buffer = StringBuffer.new
# Each step appends the text up to and including one replaced match to
# `buffer`. Note that `find` is evaluated before the remaining budget is
# checked.
go remaining_replacements =
if (internal_matcher.find) && (remaining_replacements > 0) then
internal_matcher.appendReplacement buffer replacement
@Tail_Call go (remaining_replacements - 1)
go mode
# Copy whatever follows the last replaced match.
internal_matcher.appendTail buffer
buffer.to_text
# All: delegate to the matcher's own `replaceAll`.
Regex_Mode.All ->
internal_matcher = self.build_matcher input start end
internal_matcher.replaceAll replacement
# Full: replace only when the whole input matches; the replacement itself
# is then carried out through the `First` mode.
Regex_Mode.Full ->
case self.match input mode=Regex_Mode.Full of
_ : Match -> self.replace input replacement Matching_Mode.First
Nothing -> input
# Last: count all matches first, then advance a fresh matcher to the final
# match and replace only that one.
Matching_Mode.Last ->
all_matches = self.match input
all_matches_count = if all_matches.is_nothing then 0 else all_matches.length
if all_matches_count == 0 then input else
internal_matcher = self.build_matcher input start end
buffer = StringBuffer.new
last_match_index = all_matches_count - 1
# `find` is called once per step purely to advance the matcher; its
# result is not inspected because the match count is already known.
go match_index =
internal_matcher.find
case match_index == last_match_index of
True -> internal_matcher.appendReplacement buffer replacement
False -> @Tail_Call go (match_index + 1)
go 0
internal_matcher.appendTail buffer
buffer.to_text
# Not reachable from this method: the `case` below rejects `Bounded`
# before `do_replace_mode` is ever called.
Regex_Mode.Bounded _ _ _ -> Panic.throw <|
Mode_Error.Error "Modes cannot be recursive."
# Entry point: bounded replacement is rejected outright; every other mode
# runs over the whole input.
case mode of
Regex_Mode.Bounded _ _ _ -> Panic.throw <|
Mode_Error.Error "Bounded replacements are not well-formed."
_ -> do_replace_mode mode 0 (Text_Utils.char_length input)
## The default implementation of the `Data.Text.Regex.Engine.Match` interface.
type Match
## PRIVATE
A representation of a single regular expression match.
Arguments:
- internal_match: The Java matcher (`Java_Matcher`) that holds the state of
this match; group texts and indices are read from it.
- region_start: The start of the region over which the match was made.
- region_end: The end of the region over which the match was made.
- input: The input text that was being matched.
Value (internal_match : Java_Matcher) (region_start : Integer) (region_end : Integer) (input : Text)
## Gets the text matched by the group with the provided identifier, or
`Nothing` if the group did not participate in the match. If no such group
exists for the provided identifier, a `No_Such_Group` is returned.
Arguments:
- id: The index or name of that group.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
> Example
Get the text of the group with the index 0.
import Standard.Examples
example_group =
match = Examples.match
match.group 0
> Example
Get the text of the group with the name "letters".
import Standard.Examples
example_group =
match = Examples.match
match.group "letters"
group : Integer | Text -> Text | Nothing ! No_Such_Group
group self id =
    # A panic raised by the Java matcher is captured as an error value and
    # then translated by `handle_error`.
    recovered = Panic.recover Any (self.internal_match.group id)
    recovered.map_error error-> handle_error error id
## Gets a vector containing the results of _all_ of the capturing groups in
the pattern, replacing the value of groups that did not participate in
the match with `default`.
Arguments:
- default: The value to return for a given index when the group at that
index did not participate in the match.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
> Example
Get a vector of the text matched by all of the groups in this match,
replacing the value for groups that didn't match with "UNMATCHED".
import Standard.Examples
example_groups =
match = Examples.match
match.groups default="UNMATCHED"
groups : Any -> Vector (Text | Any)
groups self default=Nothing =
    # Indices run from 0 (the full match) up to and including `groupCount`.
    index_limit = self.internal_match.groupCount + 1
    indices = 0.up_to index_limit
    or_default value = case value of
        Nothing -> default
        present -> present
    indices.map index-> or_default (self.group index)
## Gets a map containing the named capturing groups for the pattern,
replacing the value for groups that did not participate in the match with
`default`.
Arguments:
- default: The value to return for a given name when the group at that
index did not participate in the match.
> Example
Get the map of all of the named groups in this match, replacing the
value for groups that didn't match with "UNMATCHED".
import Standard.Examples
example_groups =
match = Examples.match
match.named_groups default="UNMATCHED"
named_groups : Any -> Map Text (Text | Any)
named_groups self default=Nothing =
    # The group names are read from the pattern behind the Java matcher.
    java_names = Regex_Utils.get_group_names self.internal_match.pattern
    names = Vector.from_polyglot_array java_names
    # Builds the `[name, value]` pair for one named group, substituting
    # `default` when the group did not participate in the match.
    entry_for name =
        value = case self.group name of
            Nothing -> default
            matched -> matched
        [name, value]
    Map.from_vector (names.map entry_for)
## Gets the index of the first character captured by the group with the
given identifier, or `Nothing` if the group did not participate in the
match.
Arguments:
- id: The identifier for the group to fetch the start index for.
! What is a Character?
This regular expression engine defines a "character" to mean a UTF-16
character. This means that these indices should only be used with the
result of calling `.char_vector` on the text. Using them with
`.characters` or `.codepoints` will produce incorrect results.
> Example
Get the start index in the input where the full pattern matched for
this match.
import Standard.Examples
example_start =
match = Examples.match
match.start 0
start : Integer | Text -> Integer | Nothing ! No_Such_Group
start self id =
    raw_index = Panic.recover Any (self.internal_match.start id)
    index = raw_index.map_error error-> handle_error error id
    # An index of -1 marks a group that did not participate in the match.
    case index == -1 of
        True -> Nothing
        False -> index
## Gets the index of the first character after `start` that was not captured
by the group with the given identifier, or `Nothing` if the group did not
participate in the match.
Arguments:
- id: The identifier for the group to fetch the end index for.
! What is a Character?
This regular expression engine defines a "character" to mean a UTF-16
character. This means that these indices should only be used with the
result of calling `.char_vector` on the text. Using them with
`.characters` or `.codepoints` will produce incorrect results.
> Example
Get the end index in the input where the full pattern matched for this
match.
import Standard.Examples
example_end =
match = Examples.match
match.end 0
end : Integer | Text -> Integer | Nothing ! No_Such_Group
end self id =
    raw_index = Panic.recover Any (self.internal_match.end id)
    index = raw_index.map_error error-> handle_error error id
    # An index of -1 marks a group that did not participate in the match.
    case index == -1 of
        True -> Nothing
        False -> index
## Returns the span matched by the group with the provided identifier, or
`Nothing` if the group did not participate in the match.
Arguments:
- id: The identifier for the group to fetch the end index for.
! What is a Character?
This regular expression engine defines a "character" to mean a UTF-16
character. This means that these indices should only be used with the
result of calling `.char_vector` on the text. Using them with
`.characters` or `.codepoints` will produce incorrect results.
> Example
Get the span over the input that was matched by the full match.
import Standard.Examples
example_span =
match = Examples.match
match.span 0
span : Integer | Text -> Utf_16_Span | Nothing ! No_Such_Group
span self id =
    # The group's text is only consulted to see whether the group took part.
    matched_text = self.group id
    case matched_text of
        Nothing -> Nothing
        _ ->
            range = (self.start id).up_to (self.end id)
            Utf_16_Span.Value range self.input
## Returns the start character index of the match's region.
! What is a Character?
This regular expression engine defines a "character" to mean a UTF-16
character. This means that these indices should only be used with the
result of calling `.char_vector` on the text. Using them with
`.characters` or `.codepoints` will produce incorrect results.
> Example
Get the start position in the input to which this match was limited.
import Standard.Examples
example_start_position =
match = Examples.match
match.start_position
start_position : Integer
start_position self =
    # The region start is stored on the match when it is constructed.
    self.region_start
## Returns the end character index of the match's region.
! What is a Character?
This regular expression engine defines a "character" to mean a UTF-16
character. This means that these indices should only be used with the
result of calling `.char_vector` on the text. Using them with
`.characters` or `.codepoints` will produce incorrect results.
> Example
Get the end position in the input to which this match was limited.
import Standard.Examples
example_end_position =
match = Examples.match
match.end_position
end_position : Integer
end_position self =
    # The region end is stored on the match when it is constructed.
    self.region_end
## PRIVATE
Handle errors when looking up group info.
Arguments:
- error: The error as a value.
- id: The group identifier with which the error is associated.
handle_error : Any -> (Text | Integer) -> Any
handle_error error id =
    # Both Java exception kinds mean the same thing here: no group `id`.
    missing_group = No_Such_Group.Error id
    case error of
        _ : IllegalArgumentException -> missing_group
        _ : IndexOutOfBoundsException -> missing_group
        # Anything else is passed through untouched.
        unrelated -> unrelated
## Options specific to the `Default` regular expression engine.
type Option
## Specifies that the input expression to the pattern be treated as a
sequence of literal characters. Metacharacters and escape sequences have
no special meaning in this mode.
Literal_Pattern
## Disables anchoring to the region's boundaries.
By default, the regex engine will allow `^` and `$` to match the
boundaries of a restricted region. With this option specified, they will
only match the start and end of the input.
No_Anchoring_Bounds
## Enables transparent bounds.
Setting this option will allow the regex engine to look "through" the
boundaries of the engine's region for the purposes of lookahead,
lookbehind, and boundary matching.
Without this flag, the region boundaries are treated as opaque, meaning
that the above constructs will fail to match anything outside the region.
Transparent_Bounds
## Specifies that only the unix line ending `'\n'` be considered in the
behaviour of the `^` and `$` special characters.
Unix_Lines
## PRIVATE
Generates a Java bitmask representing the options used to configure the
regex.
Arguments:
- opts: The enso-side options to configure the regex.
from_enso_options : Vector (Option | Regex_Option) -> Integer
from_enso_options opts =
    # Translates one option into the (possibly empty) list of Java flags it
    # switches on; anything unrecognised is rejected with `Invalid_Option`.
    flags_for option = case option of
        Regex_Option.Case_Insensitive -> [Java_Pattern.CASE_INSENSITIVE]
        Regex_Option.Dot_Matches_Newline -> [Java_Pattern.DOTALL]
        Regex_Option.Multiline -> [Java_Pattern.MULTILINE]
        Regex_Option.Comments -> [Java_Pattern.COMMENTS]
        Regex_Option.Ascii_Matching -> []
        Option.Literal_Pattern -> [Java_Pattern.LITERAL]
        Option.Unix_Lines -> [Java_Pattern.UNIX_LINES]
        Option.No_Anchoring_Bounds -> []
        Option.Transparent_Bounds -> []
        unknown -> Panic.throw (Invalid_Option.Error unknown)
    explicit_flags = Panic.recover Any (opts.flat_map flags_for)
    explicit_mask = explicit_flags.fold 0 .bit_or
    # The Unicode flags are added unless `Ascii_Matching` was requested.
    ascii_only = opts.contains Regex_Option.Ascii_Matching
    if ascii_only then explicit_mask else
        unicode_flags = [Java_Pattern.CANON_EQ, Java_Pattern.UNICODE_CASE, Java_Pattern.UNICODE_CHARACTER_CLASS]
        explicit_mask.bit_or (unicode_flags.fold 0 .bit_or)

View File

@ -1,24 +1,143 @@
import project.Any.Any
import project.Data.Map.Map
import project.Data.Numbers.Integer
import project.Data.Text.Span.Span
import project.Data.Text.Text
import project.Data.Range.Extensions
import project.Data.Range.Range
import project.Data.Text.Regex.No_Such_Group
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Unimplemented.Unimplemented
import project.Error.Error
import project.Errors.Common.Index_Out_Of_Bounds
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
## The `Data.Text.Regex.Engine.Match` interface.
type Match
## PRIVATE
A single regular expression match.
Arguments:
- pattern: The `Pattern` this match belongs to; it is consulted to resolve
group names and the group count.
- internal_regex_result: The engine-level result object.
internal_regex_result : RegexResult (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
- input: The text that was matched against; spans refer to it.
Value (pattern : Pattern) (internal_regex_result : Any) (input : Text)
## PRIVATE
Returns the start UTF16 character index of a group.
Gets the text matched by the group with the provided identifier, or
`Nothing` if the group did not participate in the match. If no such group
This method goes directly to the internal match object. It does not
take group names, and does not have a default.
Arguments:
- group: the integer group number.
internal_start : Integer -> Integer
internal_start self group =
    # Delegates straight to the engine result; no group-name lookup is done.
    regex_result = self.internal_regex_result
    regex_result.getStart group
## PRIVATE
Returns the end UTF16 character index, plus one, of a group.
This method goes directly to the internal match object. It does not
take group names, and does not have a default.
Arguments:
- group: the integer group number.
internal_end : Integer -> Integer
internal_end self group =
    # Delegates straight to the engine result; no group-name lookup is done.
    regex_result = self.internal_regex_result
    regex_result.getEnd group
## Returns the start UTF16 character index of a group.
Arguments:
- group: the group name or number. Marked groups defined in the regex are
numbered starting at 1; group 0 refers to the entire match.
utf_16_start : Integer | Text -> Integer
utf_16_start self group=0 =
    # A group without a span (it did not participate) has no start index.
    case self.utf_16_span group of
        Nothing -> Nothing
        group_span -> group_span.start
## Returns the end UTF16 character index, plus one, of a group.
Arguments:
- group: the group name or number. Marked groups defined in the regex are
numbered starting at 1; group 0 refers to the entire match.
utf_16_end : Integer | Text -> Integer
utf_16_end self group=0 =
    # A group without a span (it did not participate) has no end index.
    case self.utf_16_span group of
        Nothing -> Nothing
        group_span -> group_span.end
## Returns the start grapheme index of a group.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
Arguments:
- group: the group name or number. Marked groups defined in the regex are
numbered starting at 1; group 0 refers to the entire match.
start : Integer | Text -> Integer
start self group=0 =
    # A group without a span (it did not participate) has no start index.
    case self.span group of
        Nothing -> Nothing
        group_span -> group_span.start
## Returns the end grapheme index, plus one, of a group.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
Arguments:
- group: the group name or number. Marked groups defined in the regex are
numbered starting at 1; group 0 refers to the entire match.
end : Integer | Text -> Integer
end self group=0 =
    # A group without a span (it did not participate) has no end index.
    case self.span group of
        Nothing -> Nothing
        group_span -> group_span.end
## Gets the UTF16 span matched by the group with the provided identifier, or
a default value if the group did not participate in the match. If no such
group exists for the provided identifier, a `No_Such_Group` is returned.
Arguments:
- group: The integer index or name of that group.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. If the caller tries to get group 3,
Match.utf_16_span will return the default value.
utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
utf_16_span self group=0 ~default=Nothing =
    # Resolve a name or number to the numeric id used by the engine result.
    resolved_id = self.pattern.lookup_group group
    first = self.internal_start resolved_id
    last = self.internal_end resolved_id
    # An index of -1 on either side marks a group that did not participate.
    participates = first != -1 && last != -1
    case participates of
        False -> default
        True -> Utf_16_Span.Value (Range.new first last) self.input
## Gets the grapheme span matched by the group with the provided identifier, or
a default value if the group did not participate in the match. If no such group
exists for the provided identifier, a `No_Such_Group` is returned.
Arguments:
- id: The index or name of that group.
- group: The integer index or name of that group.
? The Full Match
The group with index 0 is always the full match of the pattern.
@ -26,19 +145,62 @@ type Match
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
group : Integer | Text -> Text | Nothing ! No_Such_Group
# Interface stub: ignores its argument and always throws `Unimplemented`.
group self _ = Unimplemented.throw "This is an interface only."
## PRIVATE
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
Gets a vector containing the results of _all_ of the capturing groups in
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. If the caller tries to get
group 3, Match.span will return the default value.
span : Integer | Text -> Any -> Span ! No_Such_Group
span self group=0 ~default=Nothing =
    # Ask for the UTF-16 span with an explicit `Nothing` fallback, so that a
    # non-participating group can be told apart and mapped to `default`.
    case self.utf_16_span group Nothing of
        Nothing -> default
        utf_16 -> utf_16.to_grapheme_span
## Gets the Text matched by the group with the provided identifier, or
a default value if the group did not participate in the match. If no such
group exists for the provided identifier, a `No_Such_Group` is returned.
Arguments:
- group: The integer index or name of that group.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. If the caller tries to get
group 3, Match.text will return the default value.
text : Integer | Text -> Any -> Text ! No_Such_Group
text self group=0 ~default=Nothing =
    # Ask for the span with an explicit `Nothing` fallback, so that a
    # non-participating group can be told apart and mapped to `default`.
    case self.span group Nothing of
        Nothing -> default
        group_span -> group_span.text
## Gets a vector containing the Text of _all_ of the capturing groups in
the pattern, replacing the value of groups that did not participate in
the match with `default`.
the match with `default`. This vector includes group 0, which contains
the entire match.
Arguments:
- default: The value to return for a given index when the group at that
index did not participate in the match. The default for this argument
should be `Nothing`.
index did not participate in the match.
? The Full Match
The group with index 0 is always the full match of the pattern.
@ -46,60 +208,81 @@ type Match
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. `groups` will return the
default value for groups that do not participate.
> Example
Get a vector of the text matched by all of the groups in this match,
replacing the value for groups that didn't match with "UNMATCHED".
import Standard.Examples
example_groups =
match = Examples.match
match.groups default="UNMATCHED"
groups : Any -> Vector (Text | Any)
# NOTE(review): two definitions of `groups` follow back to back - an
# "interface only" stub and a real implementation. This looks like two
# revisions shown together; confirm which one is current.
groups self _ = Unimplemented.throw "This is an interface only."
# Maps every group index in the range built by `0.up_to group_count` to its
# text, substituting `default` for groups whose text is `Nothing`.
# Presumably `group_count` already includes group 0 - confirm.
groups self ~default=Nothing =
group_numbers = 0.up_to self.pattern.group_count
group_numbers.map n-> (self.text n . if_nothing default)
## PRIVATE
Gets a map containing the named capturing groups for the pattern,
## Gets a map containing the named capturing groups for the pattern,
replacing the value for groups that did not participate in the match with
`default`.
Arguments:
- default: The value to return for a given name when the group at that
index did not participate in the match. This should default to
`Nothing`.
index did not participate in the match.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. `named_groups` will map
a named group that does not participate to the default value.
> Example
Get the map of all of the named groups in this match, replacing the
value for groups that didn't participate in the match with "UNMATCHED".
pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
## match.named_groups.keys.sort == ["empty", "letters"]
named_groups : Any -> Map Text (Text | Any)
# NOTE(review): two definitions of `named_groups` follow back to back - an
# "interface only" stub and a real implementation. This looks like two
# revisions shown together; confirm which one is current.
named_groups self _ = Unimplemented.throw "This is an interface only."
# Looks up the text of every named group of the pattern (passing `default`
# through to `text`) and pairs each name with its text to build the map.
named_groups self default=Nothing =
named_group_names = self.pattern.group_names
spans = named_group_names.map name-> self.text name default=default
Map.from_vector (named_group_names.zip spans)
## PRIVATE
Gets the index of the first character captured by the group with the
given identifier, or `Nothing` if the group did not participate in the
match.
## Gets the text matched by the group with the provided index, or
`Nothing` if the group did not participate in the match.
If the identifier is invalid then `if_missing` is returned.
Arguments:
- id: The identifier for the group to fetch the start index for.
start : Integer | Text -> Integer | Nothing ! No_Such_Group
# Interface stub: ignores its argument and always throws `Unimplemented`.
start self _ = Unimplemented.throw "This is an interface only."
- index: The integer index or name of that group.
- if_missing: The value to return if the index is out of bounds.
get : Integer -> Any -> Text | Any
get self index ~if_missing=Nothing =
    # Only a `No_Such_Group` error is replaced by the fallback; the fallback
    # itself is not evaluated unless that error actually occurs.
    fallback = _-> if_missing
    group_text = self.text index
    group_text.catch No_Such_Group fallback
## PRIVATE
Gets the index of the first character after `start` that was not captured
by the group with the given identifier, or `Nothing` if the group did not
participate in the match.
## Gets the text matched by the group with the provided index, or
`Nothing` if the group did not participate in the match.
If the identifier is invalid then an `Index_Out_Of_Bounds` error is raised.
Arguments:
- id: The identifier for the group to fetch the end index for.
end : Integer | Text -> Integer | Nothing ! No_Such_Group
# Interface stub: ignores its argument and always throws `Unimplemented`.
end self _ = Unimplemented.throw "This is an interface only."
## PRIVATE
Returns the span matched by the group with the provided identifier, or
`Nothing` if the group did not participate in the match.
Arguments:
- id: The identifier for the group to fetch the end index for.
span : Integer | Text -> Span | Nothing ! No_Such_Group
# Interface stub: ignores its argument and always throws `Unimplemented`.
span self _ = Unimplemented.throw "This is an interface only."
## PRIVATE
Returns the start character index of the match's region.
start_position : Integer
# Interface stub: always throws `Unimplemented`.
start_position self = Unimplemented.throw "This is an interface only."
## Returns the end character index of the match's region.
end_position : Integer
# Interface stub: always throws `Unimplemented`.
end_position self = Unimplemented.throw "This is an interface only."
- index: The integer index or name of that group. Unlike `get`, there is
no fallback value: an out-of-bounds index yields `Index_Out_Of_Bounds`.
at : Integer -> Text ! Index_Out_Of_Bounds
at self index =
# The error expression is written inline as the `if_missing` argument of
# `get`, which declares that parameter as `~if_missing` (suspended), so the
# `Index_Out_Of_Bounds` error is only produced when `get` falls back to it.
# The reported length is the pattern's `group_count`.
self.get index if_missing=(Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count))

View File

@ -1,287 +0,0 @@
import project.Any.Any
import project.Data.Map.Map
import project.Data.Numbers.Integer
import project.Data.Range.Extensions
import project.Data.Range.Range
import project.Data.Text.Regex_2.No_Such_Group
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Error.Error
import project.Errors.Common.Index_Out_Of_Bounds
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
type Match_2
## internal_regex_result : RegexResult (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
Fields:
- pattern: the pattern that produced this match; used to resolve group
identifiers (`lookup_group`) and to count groups (`group_count`).
- internal_regex_result: the engine result, queried through `getStart` and
`getEnd`.
- input: the text that was matched; all spans returned by this type refer
to it.
Value (pattern : Pattern_2) (internal_regex_result : Any) (input : Text)
## PRIVATE
Returns the start UTF16 character index of a group.
This method goes directly to the internal match object. It does not
take group names, and does not have a default.
A result of -1 is treated by `utf_16_span` as "the group did not
participate in the match".
Arguments:
- group: the integer group number.
internal_start : Integer -> Integer
internal_start self group = self.internal_regex_result.getStart group
## PRIVATE
Returns the end UTF16 character index, plus one, of a group.
This method goes directly to the internal match object. It does not
take group names, and does not have a default.
A result of -1 is treated by `utf_16_span` as "the group did not
participate in the match".
Arguments:
- group: the integer group number.
internal_end : Integer -> Integer
internal_end self group = self.internal_regex_result.getEnd group
## Returns the start UTF16 character index of a group, or `Nothing` if the
   group did not participate in the match. If no such group exists for the
   provided identifier, a `No_Such_Group` error is returned.

   Arguments:
   - group: the group name or number. Marked groups defined in the regex are
     numbered starting at 1; group 0 refers to the entire match.
utf_16_start : Integer | Text -> Integer | Nothing ! No_Such_Group
utf_16_start self group=0 =
    span = self.utf_16_span group
    # `utf_16_span` yields `Nothing` (its default) for non-participating groups.
    if span.is_nothing then Nothing else span.start
## Returns the end UTF16 character index, plus one, of a group, or `Nothing`
   if the group did not participate in the match. If no such group exists
   for the provided identifier, a `No_Such_Group` error is returned.

   Arguments:
   - group: the group name or number. Marked groups defined in the regex are
     numbered starting at 1; group 0 refers to the entire match.
utf_16_end : Integer | Text -> Integer | Nothing ! No_Such_Group
utf_16_end self group=0 =
    span = self.utf_16_span group
    # `utf_16_span` yields `Nothing` (its default) for non-participating groups.
    if span.is_nothing then Nothing else span.end
## Returns the start grapheme index of a group, or `Nothing` if the group did
   not participate in the match. If no such group exists for the provided
   identifier, a `No_Such_Group` error is returned.

   ! What is a Character?
     A character is defined as an Extended Grapheme Cluster, see Unicode
     Standard Annex 29. This is the smallest unit that still has semantic
     meaning in most text-processing applications.

   Arguments:
   - group: the group name or number. Marked groups defined in the regex are
     numbered starting at 1; group 0 refers to the entire match.
start : Integer | Text -> Integer | Nothing ! No_Such_Group
start self group=0 =
    span = self.span group
    # `span` yields `Nothing` (its default) for non-participating groups.
    if span.is_nothing then Nothing else span.start
## Returns the end grapheme index, plus one, of a group, or `Nothing` if the
   group did not participate in the match. If no such group exists for the
   provided identifier, a `No_Such_Group` error is returned.

   ! What is a Character?
     A character is defined as an Extended Grapheme Cluster, see Unicode
     Standard Annex 29. This is the smallest unit that still has semantic
     meaning in most text-processing applications.

   Arguments:
   - group: the group name or number. Marked groups defined in the regex are
     numbered starting at 1; group 0 refers to the entire match.
end : Integer | Text -> Integer | Nothing ! No_Such_Group
end self group=0 =
    span = self.span group
    # `span` yields `Nothing` (its default) for non-participating groups.
    if span.is_nothing then Nothing else span.end
## Gets the UTF16 span matched by the group with the provided identifier, or
a default value if the group did not participate in the match. If no such
group exists for the provided identifier, a `No_Such_Group` is returned.
Arguments:
- group: The integer index or name of that group.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
Match_2.utf_16_span will return the default value.
utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
utf_16_span self group=0 ~default=Nothing =
    # Resolve a name or number to a validated group number first; an invalid
    # identifier propagates as a `No_Such_Group` error.
    group_id = self.pattern.lookup_group group
    first = self.internal_start group_id
    last = self.internal_end group_id
    # The engine reports -1 for a group that did not take part in the match.
    participates = first != -1 && last != -1
    if participates then Utf_16_Span.Value (Range.new first last) self.input else default
## Gets the grapheme span matched by the group with the provided identifier, or
a default value if the group did not participate in the match. If no such group
exists for the provided identifier, a `No_Such_Group` is returned.
Arguments:
- group: The integer index or name of that group.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
! What is a Character?
A character is defined as an Extended Grapheme Cluster, see Unicode
Standard Annex 29. This is the smallest unit that still has semantic
meaning in most text-processing applications.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get
group 3, Match_2.span will return the default value.
span : Integer | Text -> Any -> Span ! No_Such_Group
span self group=0 ~default=Nothing =
    # Ask for the UTF-16 span with an explicit `Nothing` default so that a
    # non-participating group can be told apart and mapped to `default`.
    case self.utf_16_span group Nothing of
        Nothing -> default
        utf_16 -> utf_16.to_grapheme_span
## Gets the Text matched by the group with the provided identifier, or
a default value if the group did not participate in the match. If no such
group exists for the provided identifier, a `No_Such_Group` is returned.
Arguments:
- group: The integer index or name of that group.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get
group 3, Match_2.text will return the default value.
text : Integer | Text -> Any -> Text ! No_Such_Group
text self group=0 ~default=Nothing =
    # A `Nothing` span means the group did not participate; only then is the
    # caller-supplied `default` evaluated.
    case self.span group Nothing of
        Nothing -> default
        grapheme_span -> grapheme_span.text
## Gets a vector containing the Text of _all_ of the capturing groups in
the pattern, replacing the value of groups that did not participate in
the match with `default`. This vector includes group 0, which contains
the entire match.
Arguments:
- default: The value to return for a given index when the group at that
index did not participate in the match.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. `groups` will return the
default value for groups that do not participate.
> Example
Get a vector of the text matched by all of the groups in this match,
replacing the value for groups that didn't match with "UNMATCHED".
import Standard.Examples
example_groups =
match = Examples.match
match.groups default="UNMATCHED"
groups : Any -> Vector (Text | Any)
groups self ~default=Nothing =
    # Group numbers run from 0 (the whole match) up to, but excluding, the
    # pattern's group count.
    0.up_to self.pattern.group_count . map index->
        self.text index . if_nothing default
## Gets a map containing the named capturing groups for the pattern,
replacing the value for groups that did not participate in the match with
`default`.
Arguments:
- default: The value to return for a given name when the group at that
index did not participate in the match.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. `named_groups` will map
a named group that does not participate to the default value.
> Example
Get the map of all of the named groups in this match, replacing the
value for groups that didn't participate in the match with "UNMATCHED".
pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
## match.named_groups.keys.sort == ["empty", "letters"]
named_groups : Any -> Map Text (Text | Any)
named_groups self default=Nothing =
    names = self.pattern.group_names
    # Pair every group name with the text it matched (or `default`).
    entries = names.map name-> [name, self.text name default=default]
    Map.from_vector entries
## Gets the text matched by the group with the provided index, or `Nothing`
   if the group did not participate in the match.
   If the identifier is invalid then `if_missing` is returned.

   Arguments:
   - index: The integer index or name of that group.
   - if_missing: The value to return if the index is out of bounds.
get : Integer -> Any -> Text | Any
get self index ~if_missing=Nothing =
    matched_text = self.text index
    # Only an invalid group identifier (`No_Such_Group`) is recovered from.
    matched_text.catch No_Such_Group _-> if_missing
## Gets the text matched by the group with the provided index, or `Nothing`
   if the group did not participate in the match.
   If the identifier is invalid then Index_Out_Of_Bounds is thrown.

   Arguments:
   - index: The integer index or name of that group.
at : Integer -> Text ! Index_Out_Of_Bounds
at self index =
    # Same lookup as `get`, but an unknown group becomes an
    # `Index_Out_Of_Bounds` error instead of a fallback value.
    self.text index . catch No_Such_Group _->
        Error.throw (Index_Out_Of_Bounds.Error index self.pattern.group_count)

View File

@ -1,78 +1,183 @@
import project.Data.Boolean.Boolean
import project.Any.Any
import project.Data.Filter_Condition.Filter_Condition
import project.Data.Map.Map
import project.Data.Numbers.Integer
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Range.Extensions
import project.Data.Range.Range
import project.Data.Text.Helpers
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
import project.Data.Text.Regex.No_Such_Group
import project.Data.Text.Regex.Replacer.Replacer
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Unimplemented.Unimplemented
import project.Errors.Common.Type_Error
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Meta
import project.Nothing.Nothing
import project.Polyglot.Polyglot
from project.Data.Boolean import Boolean, True, False
from project.Data.Index_Sub_Range import sort_and_merge_ranges
polyglot java import org.enso.base.Replacer_Cache
polyglot java import org.enso.base.Text_Utils
## The `Data.Text.Regex.Engine.Pattern` interface.
type Pattern
## internal_regex_object : RegexObject (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
The methods of this type use the following members of the object:
`exec`, `groupCount`, `groups` and `pattern`.
Value (internal_regex_object : Any)
## PRIVATE
Tries to match the provided `input` against the pattern `self`.
Arguments:
- input: The text to match the pattern described by `self` against.
- mode: The matching mode to use. This must default to `Regex_Mode.All`.
This method will _always_ return `Nothing` if it fails to match.
? Return Type
When asked to match in a mode that can only provide a single match, the
return type is either a single `Match` object. When asked to match in a
mode that permits multiple matches, it will always return a `Vector`,
even if only a single match is found.
# Interface placeholder: both arguments are ignored and the call always
# throws `Unimplemented`.
match : Text -> (Regex_Mode | Matching_Mode) -> Match | Vector Match | Nothing
match self _ _ = Unimplemented.throw "This is an interface only."
## PRIVATE
Returns `True` if the input matches against the pattern described by
## Returns `True` if the input matches against the pattern described by
`self`, otherwise `False`.
Arguments:
- input: The text to check for matching.
# Interface placeholder: the argument is ignored and the call always throws
# `Unimplemented`.
matches : Text -> Boolean
matches self _ = Unimplemented.throw "This is an interface only."
matches : Text -> Boolean | Type_Error
matches self input =
    Helpers.expect_text input <|
        m = self.internal_regex_object.exec input 0
        # The engine reports UTF-16 indices, so the end of the match must be
        # compared with the UTF-16 length of the input (as `Match_Iterator`
        # does), not with `input.length`, which counts graphemes and differs
        # for text containing multi-code-unit graphemes.
        m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == Text_Utils.char_length input
## PRIVATE
## Tries to match the provided `input` against the pattern `self`.
Tries to find the text in the `input` that matches against the pattern
`self`.
Returns a `Match` containing the matched text and its match groups, or
`Nothing` if the match failed.
Arguments:
- input: The text to find matches in.
- mode: The matching mode to use. This must default to `Regex_Mode.All`
- input: The text to match the pattern described by `self` against.
match : Text -> Match | Nothing | Type_Error
match self input =
    Helpers.expect_text input <|
        # Only the first step of the iteration is needed.
        first_step = Match_Iterator.new self input . next
        case first_step of
            Match_Iterator_Value.Last _ -> Nothing
            Match_Iterator_Value.Next _ found _ -> found
This method will _always_ return `Nothing` if it fails to find any
matches.
## Tries to match the provided `input` against the pattern `self`.
? Return Type
When asked to match in a mode that can only provide a single match, the
return type is either a single `Match` object. When asked to match in a
mode that permits multiple matches, it will always return a `Vector`,
even if only a single match is found.
# Interface placeholder: both arguments are ignored and the call always
# throws `Unimplemented`.
find : Text -> (Regex_Mode | Matching_Mode) -> Text | Vector Text | Nothing
find self _ _ = Unimplemented.throw "This is an interface only."
Returns a `Vector Match` object, each containing the matched text
and its match groups.
## PRIVATE
Arguments:
- input: The text to match the pattern described by `self` against.
# Throws `Illegal_Argument` when the pattern itself is the empty string; the
# signature now records that error alongside `Type_Error`.
# NOTE(review): a non-empty pattern that can match the empty text (e.g. `a*`)
# does not appear to advance the iterator cursor -- confirm termination.
match_all : Text -> Vector Match ! Type_Error | Illegal_Argument
match_all self input =
    Helpers.expect_text input <|
        pattern_is_empty = self.internal_regex_object.pattern == ''
        if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
            builder = Vector.new_builder
            it = Match_Iterator.new self input
            # Walk the iterator, collecting each match and discarding fillers.
            go it = case it.next of
                Match_Iterator_Value.Next _ match next_it ->
                    builder.append match
                    @Tail_Call go next_it
                Match_Iterator_Value.Last _ -> Nothing
            go it
            builder.to_vector
## Tries to match the provided `input` against the pattern `self`.
Returns a `Text` containing the matched text, or `Nothing` if the match
failed.
Arguments:
- input: The text to match the pattern described by `self` against.
find : Text -> Text | Nothing | Type_Error
find self input =
    Helpers.expect_text input <|
        # Reduce the first match (if any) to the text of its group 0.
        first_match = self.match input
        match_to_group_maybe first_match
## Tries to match the provided `input` against the pattern `self`.
Returns a `Vector Text`, each containing the matched text.
If the pattern does not match, an empty `Vector` is returned.
Arguments:
- input: The text to match the pattern described by `self` against.
find_all : Text -> Vector Text | Type_Error
find_all self input =
    Helpers.expect_text input <|
        # Every match is reduced to the text of its group 0.
        all_matches = self.match_all input
        all_matches.map match_to_group_maybe
## ADVANCED
Splits the `input` text based on the pattern described by `self`.
Arguments:
- input: The text to split based on the pattern described by `self`.
- mode: The splitting mode to use. This must default to `Regex_Mode.All`.
- only_first: If true, only split at the first occurrence.
This method will _always_ return a vector. If no splits take place, the
vector will contain a single element.
# Interface placeholder: both arguments are ignored and the call always
# throws `Unimplemented`.
split : Text -> (Matching_Mode | Integer | Regex_Mode) -> Vector Text
split self _ _ = Unimplemented.throw "This is an interface only."
vector will contain a single element (equal to the original string).
## PRIVATE
> Example
Split on the first instance of the pattern.
pattern = Regex.compile "cd"
input = "abcdefcdghij"
texts = pattern.split input only_first=True
texts . should_equal ["ab", "efcdghij"]
> Example
Split on all instances of the pattern in the input.
pattern = Regex.compile "a"
input = "bacadaeaf"
texts = pattern.split input
texts . should_equal ["b", "c", "d", "e", "f"]
> Example
Returns the original text if there are no matches.
pattern = Regex.compile "aa"
input = "abcdefghij"
texts = pattern.split input
texts . should_equal ["abcdefghij"]
split : Text -> Boolean -> Vector Text | Type_Error
split self input only_first=False =
    Helpers.expect_text input <|
        pieces = Vector.new_builder
        start = Match_Iterator.new self input
        # Each step contributes the unmatched text preceding a match; the
        # final step contributes whatever follows the last match.
        collect step = case step of
            Match_Iterator_Value.Last tail ->
                pieces.append tail.text
            Match_Iterator_Value.Next before _ rest ->
                pieces.append before.text
                # With `only_first`, the remainder is taken verbatim.
                following = if only_first then rest.early_exit else rest.next
                @Tail_Call collect following
        collect start.next
        pieces.to_vector
## ADVANCED
Takes an input string and returns all the matches as a `Vector Text`.
If the pattern contains marked groups, the values are concatenated
together; otherwise the whole match is returned. Non-participating
groups are omitted.
Arguments:
- input: The text to tokenize.
> Example
Split to blocks of 3 characters.
Regex.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']
> Example
Split to blocks of 3 characters taking first and third letters.
Regex.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']
> Example
Split a text on any white space.
Regex.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
== ['Hello','Big','Wide','World','Goodbye!']
tokenize : Text -> Vector Text
tokenize self input =
    # Each match is turned into one output token by the module-level helper.
    all_matches = self.match_all input
    all_matches.map found-> build_tokenization_output_from_match self found
## ADVANCED
Replace all occurrences of the pattern described by `self` in the `input`
with the specified `replacement`.
@ -80,10 +185,250 @@ type Pattern
Arguments:
- input: The text in which to perform the replacement(s).
- replacement: The literal text with which to replace any matches.
- mode: The matching mode to use for finding candidates to replace. This
must default to `Regex_Mode.All`.
- only_first: If True, only replace the first match.
If this method performs no replacements it will return the `input` text
unchanged.
# Interface placeholder: all arguments are ignored and the call always throws
# `Unimplemented`.
replace : Text -> Text -> Regex_Mode | Matching_Mode | Integer -> Text
replace self _ _ _ = Unimplemented.throw "This is an interface only."
The replacement string can contain references to groups matched by the
regex. The following syntaxes are supported:
$0: the entire match string
$&: the entire match string
$n: the nth group
$<foo>: Named group `foo`
> Example
Replace letters in the text "aa".
pattern = Regex.compile 'aa'
pattern.replace 'aaa' 'b' == 'ba'
> Example
Replace all occurrences of letters 'l' and 'o' with '#'.
pattern = Regex.compile '[lo]'
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
> Example
Replace the first occurrence of letter 'l' with '#'.
pattern = Regex.compile 'l'
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
> Example
Replace texts in quotes with parentheses.
pattern = Regex.compile '"(.*?)"'
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
> Example
Replace a literal string with a replacement value.
pattern = Regex.compile "aa"
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "xyz"
match == "xyz ab xyz ac ad xyz xyz ax"
> Example
Replace each word with the same word surrounded by `[]`.
pattern = Regex.compile "([a-z]+)"
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
replace : Text -> Text -> Boolean -> Text | Type_Error
replace self input replacement only_first=False =
    Helpers.expect_text input <|
        it = Match_Iterator.new self input
        # The previous `case it of Match_Iterator_Value.Last ...` branch could
        # never be taken: `it` is a `Match_Iterator`, not a
        # `Match_Iterator_Value`. It has been removed; behaviour is unchanged.
        replacer = Replacer.new replacement self
        # Skip the work below if the replacer could not be built.
        replacer.if_not_error <|
            # `current` accumulates the output text built so far.
            go next current = case next of
                Match_Iterator_Value.Next filler match next_it ->
                    new_value = current + filler.text + (replacer.replace match)
                    # With `only_first`, the rest of the input is kept as-is.
                    next = if only_first then next_it.early_exit else next_it.next
                    @Tail_Call go next new_value
                Match_Iterator_Value.Last filler ->
                    current + filler.text
            go it.next ""
## PRIVATE
Look up a match group name or number, and check that it is valid.
Arguments:
- id: The name or number of the group that was asked for.
Returns: a group number.
A group number is invalid if it is outside the range of groups
that were in the original pattern.
A group name is invalid if it was not defined in the original pattern.
A group name is an alias for a group number; if a name is passed to
this method, it returns the corresponding group number.
If a group number is passed to `lookup_group` and it is valid, it will
simply return the group number.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. If the caller tries to get group 3,
Match.group will return Nothing.
lookup_group : Integer | Text -> Integer ! No_Such_Group
lookup_group self id =
    regex = self.internal_regex_object
    case id of
        number : Integer ->
            # Valid numbers are 0 (the whole match) up to groupCount - 1.
            in_range = number >= 0 && number < regex.groupCount
            if in_range then number else Error.throw (No_Such_Group.Error number)
        name : Text ->
            # `groups` maps group names to numbers; it is `Nothing` when the
            # pattern defines no named groups.
            case regex.groups of
                Nothing -> Error.throw (No_Such_Group.Error name)
                named_groups -> case read_group_map named_groups name of
                    number : Integer -> number
                    Nothing -> Error.throw (No_Such_Group.Error name)
## PRIVATE
Return a lazy iterator over matches against a string.
The iterator starts at the beginning of `input` (see `Match_Iterator.new`).
Arguments:
- input: the string to match against.
iterator : Text -> Match_Iterator
iterator self input = Match_Iterator.new self input
## Return the number of groups in the underlying RegexObject.
Note, the count includes group 0 (the whole match) as well.
Consequently the valid group numbers accepted by `lookup_group` run from 0
to `group_count - 1`.
group_count : Integer
group_count self = self.internal_regex_object.groupCount
## Return a vector of all named group names.
# The signature previously claimed `Map Text Integer`, but only the keys of
# the name-to-number map are returned.
# NOTE(review): `lookup_group` treats `groups` as possibly `Nothing` (no named
# groups); confirm that `polyglot_map_to_map` copes with that value here.
group_names : Vector Text
group_names self =
    map = polyglot_map_to_map self.internal_regex_object.groups
    map.keys
## PRIVATE
Performs the regex match, and iterates through the results. Yields both
the matched parts of the string, and the 'filler' parts between them.
The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because
matches and replacement boundaries can fall in the middle of multi-
character graphemes, thereby splitting them apart.
At each step, it yields a Match_Iterator_Value, which has either a filler
and a match, or just the final filler. A Match_Iterator_Value.Last value is
returned at the end, and only at the end.
Optionally, you can call `early_exit` to have it return the remainder of
the string, unmatched, as a single Last value. (Used for `replace` with
`only_first=True`.)
type Match_Iterator
# Creates an iterator positioned at the start of `input` (cursor 0).
new : Pattern -> Text -> Match_Iterator
new pattern input = Match_Iterator.Value pattern input 0
# `cursor` is the UTF-16 index in `input` from which `next` runs the regex; the
# filler returned by `next` and `early_exit` also starts there.
Value (pattern : Pattern) (input : Text) (cursor : Integer)
## Return the next match, or the last filler string if there is no
   additional match.

   Also returns the next iterator, if there was a match.
next : Match_Iterator_Value
next self =
    result = self.pattern.internal_regex_object.exec self.input self.cursor
    case result.isMatch of
        True ->
            # Text between the cursor and the start of this match.
            before_match = Range.new self.cursor (result.getStart 0)
            filler = Utf_16_Span.Value before_match self.input
            found = Match.Value self.pattern result self.input
            # The following search resumes right after this match.
            # NOTE(review): for a zero-length match at the cursor this does not
            # appear to advance -- confirm callers cannot loop forever.
            resume_at = found.utf_16_end 0
            rest = Match_Iterator.Value self.pattern self.input resume_at
            Match_Iterator_Value.Next filler found rest
        False ->
            # No further match: everything from the cursor on is filler.
            to_end = Range.new self.cursor (Text_Utils.char_length self.input)
            Match_Iterator_Value.Last (Utf_16_Span.Value to_end self.input)
## Returns the remainder of the string, unmatched, as a `Last` value.
early_exit : Match_Iterator_Value
early_exit self =
    # The remainder runs from the cursor to the UTF-16 end of the input.
    input_end = Text_Utils.char_length self.input
    remainder = Utf_16_Span.Value (Range.new self.cursor input_end) self.input
    Match_Iterator_Value.Last remainder
# Debug aid: runs the whole iteration and lists fillers wrapped in double
# quotes and matches wrapped in slashes, in order of occurrence.
to_text_debug : Vector Text
to_text_debug self =
    parts = Vector.new_builder
    quoted text = '\"' + text + '\"'
    walk iterator = case iterator.next of
        Match_Iterator_Value.Last filler ->
            parts.append (quoted filler.text)
        Match_Iterator_Value.Next filler match rest ->
            parts.append (quoted filler.text)
            parts.append ("/" + (match.span 0).text + "/")
            walk rest
    walk self
    parts.to_vector
## PRIVATE
   One step of a `Match_Iterator`.
   - Next: the filler preceding a match, the match itself, and the iterator
     that continues after it.
   - Last: the final filler; no further matches follow.
   Fillers are built with `Utf_16_Span.Value` by `Match_Iterator`, so the
   fields are declared as `Utf_16_Span` rather than `Span`.
type Match_Iterator_Value
    Next (filler : Utf_16_Span) (match : Match) (next_iterator : Match_Iterator)
    Last (filler : Utf_16_Span)
## PRIVATE
   Convert the polyglot map to a Map, using the polyglot object's member
   names as keys and the corresponding member values as values.
polyglot_map_to_map : Any -> Map Any Any
polyglot_map_to_map map =
    member_names = Vector.from_polyglot_array (Polyglot.get_members map)
    entries = member_names.map name-> [name, Polyglot.get_member map name]
    Map.from_vector entries
## PRIVATE
   Get the number of the named group from the polyglot map of group names.
read_group_map : Any -> Text -> Integer | Nothing
read_group_map polyglot_map name =
    polyglot_map_to_map polyglot_map . get name
## PRIVATE
   Returns the text of the whole match (group 0), passing `Nothing` through.
match_to_group_maybe : Match | Nothing -> Text | Nothing
match_to_group_maybe match = case match of
    Nothing -> Nothing
    found -> found.text 0
## PRIVATE
   Build an output string from a Match resulting from `tokenize`.
   See `tokenize`.
build_tokenization_output_from_match : Pattern -> Match -> Text
build_tokenization_output_from_match pattern match =
    # A group count of 1 means only group 0 exists: use the whole match.
    if pattern.group_count == 1 then match.text 0 else
        # Spans of all marked groups (1 and up) that participated in the match.
        participating = 1.up_to pattern.group_count . map (match.span _) . filter Filter_Condition.Not_Nothing
        group_ranges = participating.map s-> case s of Span.Value r _ -> r

        # Nested groups overlap their parents; merging the sorted ranges keeps
        # only the top-level extents.
        merged = sort_and_merge_ranges group_ranges

        # Rebuild spans over the original text from the merged ranges and
        # concatenate their texts.
        source_text = case participating.at 0 of Span.Value _ t -> t
        merged.map (r-> (Span.Value r source_text).text) . join

View File

@ -1,425 +0,0 @@
import project.Any.Any
import project.Data.Filter_Condition.Filter_Condition
import project.Data.Map.Map
import project.Data.Numbers.Integer
import project.Data.Range.Extensions
import project.Data.Range.Range
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Regex.Match_2.Match_2
import project.Data.Text.Regex.Replacer.Replacer
import project.Data.Text.Regex_2.No_Such_Group
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Meta
import project.Nothing.Nothing
import project.Polyglot.Polyglot
from project.Data.Boolean import Boolean, True, False
from project.Data.Index_Sub_Range import sort_and_merge_ranges
polyglot java import org.enso.base.Replacer_Cache
polyglot java import org.enso.base.Text_Utils
type Pattern_2
## internal_regex_object : RegexObject (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
The methods visible below use the `exec` and `pattern` members of the object.
Value (internal_regex_object : Any)
## Returns `True` if the input matches against the pattern described by
   `self`, otherwise `False`.

   Arguments:
   - input: The text to check for matching.
matches : Text -> Boolean
matches self input =
    m = self.internal_regex_object.exec input 0
    # The engine reports UTF-16 indices, so compare the end of the match with
    # the UTF-16 length of the input (as `Match_Iterator` does) rather than
    # with `input.length`, which counts graphemes.
    m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == Text_Utils.char_length input
## Tries to match the provided `input` against the pattern `self`.
Returns a `Match_2` containing the matched text and its match groups, or
`Nothing` if the match failed.
Arguments:
- input: The text to match the pattern described by `self` against.
match : Text -> Match_2 | Nothing
match self input =
    # Only the first step of the iteration is needed.
    first_step = Match_Iterator.new self input . next
    case first_step of
        Match_Iterator_Value.Last _ -> Nothing
        Match_Iterator_Value.Next _ found _ -> found
## Tries to match the provided `input` against the pattern `self`.
Returns a `Vector Match_2` object, each containing the matched text
and its match groups.
Arguments:
- input: The text to match the pattern described by `self` against.
match_all : Text -> Vector Match_2 ! Illegal_Argument
match_all self input =
    # An empty pattern is rejected up front.
    case self.internal_regex_object.pattern == '' of
        True -> Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern")
        False ->
            found = Vector.new_builder
            # Walk the iterator, keeping each match and discarding fillers.
            collect iterator = case iterator.next of
                Match_Iterator_Value.Last _ -> Nothing
                Match_Iterator_Value.Next _ match rest ->
                    found.append match
                    @Tail_Call collect rest
            collect (Match_Iterator.new self input)
            found.to_vector
## Tries to match the provided `input` against the pattern `self`.
Returns a `Text` containing the matched text, or `Nothing` if the match
failed.
Arguments:
- input: The text to match the pattern described by `self` against.
find : Text -> Text | Nothing
find self input =
    # Reduce the first match (if any) to the text of its group 0.
    first_match = self.match input
    match_to_group_maybe first_match
## Tries to match the provided `input` against the pattern `self`.
Returns a `Vector Text`, each containing the matched text.
If the pattern does not match, an empty `Vector` is returned.
Arguments:
- input: The text to match the pattern described by `self` against.
find_all : Text -> Vector Text
find_all self input =
    # Every match is reduced to the text of its group 0.
    all_matches = self.match_all input
    all_matches.map match_to_group_maybe
## ADVANCED
Splits the `input` text based on the pattern described by `self`.
Arguments:
- input: The text to split based on the pattern described by `self`.
- only_first: If True, only split at the first occurrence.
This method will _always_ return a vector. If no splits take place, the
vector will contain a single element (equal to the original string).
> Example
Split on the first instance of the pattern.
pattern = Regex_2.compile "cd"
input = "abcdefcdghij"
texts = pattern.split input only_first=True
texts . should_equal ["ab", "efcdghij"]
> Example
Split on all instances of the pattern in the input.
pattern = Regex_2.compile "a"
input = "bacadaeaf"
texts = pattern.split input
texts . should_equal ["b", "c", "d", "e", "f"]
> Example
Returns the original text if there are no matches.
pattern = Regex_2.compile "aa"
input = "abcdefghij"
texts = pattern.split input
texts . should_equal ["abcdefghij"]
split : Text -> Boolean -> Vector Text
split self input only_first=False =
    pieces = Vector.new_builder
    start = Match_Iterator.new self input
    # Each step contributes the unmatched text preceding a match; the final
    # step contributes whatever follows the last match.
    collect step = case step of
        Match_Iterator_Value.Last tail ->
            pieces.append tail.text
        Match_Iterator_Value.Next before _ rest ->
            pieces.append before.text
            # With `only_first`, the remainder is taken verbatim.
            following = if only_first then rest.early_exit else rest.next
            @Tail_Call collect following
    collect start.next
    pieces.to_vector
## ADVANCED
Takes an input string and returns all the matches as a `Vector Text`.
If the pattern contains marked groups, the values are concatenated
together; otherwise the whole match is returned. Non-participating
groups are omitted.
Arguments:
- input: The text to tokenize.
> Example
Split to blocks of 3 characters.
Regex_2.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']
> Example
Split to blocks of 3 characters taking first and third letters.
Regex_2.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']
> Example
Split a text on any white space.
Regex_2.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
== ['Hello','Big','Wide','World','Goodbye!']
tokenize : Text -> Vector Text
tokenize self input =
    # Each match is reduced to a single token by the shared helper.
    matches = self.match_all input
    matches.map match-> build_tokenization_output_from_match self match
## ADVANCED
Replace all occurrences of the pattern described by `self` in the `input`
with the specified `replacement`.
Arguments:
- input: The text in which to perform the replacement(s).
- replacement: The literal text with which to replace any matches.
- only_first: If True, only replace the first match.
If this method performs no replacements it will return the `input` text
unchanged.
The replacement string can contain references to groups matched by the
regex. The following syntaxes are supported:
$0: the entire match string
$&: the entire match string
$n: the nth group
$<foo>: Named group `foo`
> Example
Replace letters in the text "aa".
pattern = Regex_2.compile 'aa'
pattern.replace 'aaa' 'b' == 'ba'
> Example
Replace all occurrences of letters 'l' and 'o' with '#'.
pattern = Regex_2.compile '[lo]'
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
> Example
Replace the first occurrence of letter 'l' with '#'.
pattern = Regex_2.compile 'l'
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
> Example
Replace texts in quotes with parentheses.
pattern = Regex_2.compile '"(.*?)"'
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
> Example
Replace a literal string with a replacement value.
pattern = Regex_2.compile "aa"
input = "aa ab aa ac ad aa aa ax"
match = pattern.replace input "xyz"
match == "xyz ab xyz ac ad xyz xyz ax"
> Example
Replace each word with the same word surrounded by `[]`.
pattern = Regex_2.compile "([a-z]+)"
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
replace : Text -> Text -> Boolean -> Text
replace self input replacement only_first=False =
it = Match_Iterator.new self input
case it of
# NOTE(review): `it` is a `Match_Iterator`, not a `Match_Iterator_Value`, so
# this branch looks unreachable as written - confirm whether `it.next` was
# intended here.
Match_Iterator_Value.Last filler -> filler.text
_ ->
# Parse the replacement template once, up front. If that produced an error,
# `if_not_error` returns it instead of running the loop below.
replacer = Replacer.new replacement self
replacer.if_not_error <|
# `current` accumulates the output text built so far.
go next current = case next of
Match_Iterator_Value.Next filler match next_it ->
new_value = current + filler.text + (replacer.replace match)
# With `only_first`, `early_exit` returns the rest of the input as the
# final filler, so no further matches are replaced.
next = if only_first then next_it.early_exit else next_it.next
@Tail_Call go next new_value
Match_Iterator_Value.Last filler ->
current + filler.text
go it.next ""
## PRIVATE
Look up a match group name or number, and check that it is valid.
Arguments:
- id: The name or number of the group that was asked for.
Returns: a group number.
A group number is invalid if it is outside the range of groups
that were in the original pattern.
A group name is invalid if it was not defined in the original pattern.
A group name is an alias for a group number; if a name is passed to
this method, it returns the corresponding group number.
If a group number is passed to `lookup_group` and it is valid, it will
simply return the group number.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
Match_2.group will return Nothing.
lookup_group : Integer | Text -> Integer ! No_Such_Group
lookup_group self id =
    case id of
        # Numeric ids only need a range check against the pattern's group
        # count (which includes group 0, the whole match).
        n : Integer -> case (n >= 0 && n < self.internal_regex_object.groupCount) of
            True -> n
            False -> Error.throw (No_Such_Group.Error n)
        name : Text ->
            # `groups` maps each group name to its number; it is `Nothing`
            # when the pattern defines no named groups.
            groups = self.internal_regex_object.groups
            group_number = case groups of
                Nothing -> Nothing
                _ -> read_group_map groups name
            # "No named groups at all" and "unknown name" are reported the
            # same way.
            case group_number of
                _ : Integer -> group_number
                Nothing -> Error.throw (No_Such_Group.Error name)
## PRIVATE
Return a lazy iterator over matches against a string.
Arguments:
- input: the string to match against.
iterator : Text -> Match_Iterator
iterator self input = Match_Iterator.new self input
## Return the number of groups in the underlying RegexObject.
Note, the count includes group 0 (the whole match) as well, so a pattern
without any capturing groups reports a count of 1.
group_count : Integer
group_count self = self.internal_regex_object.groupCount
## Return a vector of the names of all named groups in the pattern.

   Returns an empty vector if the pattern defines no named groups.
group_names : Vector Text
group_names self =
    groups = self.internal_regex_object.groups
    case groups of
        # `groups` is `Nothing` when the pattern has no named groups.
        Nothing -> []
        _ -> polyglot_map_to_map groups . keys
## PRIVATE
Performs the regex match, and iterates through the results. Yields both
the matched parts of the string, and the 'filler' parts between them.
The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because
matches and replacement boundaries can fall in the middle of multi-
character graphemes, thereby splitting them apart.
At each step, it yields a Match_Iterator_Value, which has either a filler
and a match, or just the final filler. A Match_Iterator_Value.Last value is
returned at the end, and only at the end.
Optionally, you can call `early_exit` to have it return the remainder of
the string, unmatched, as a single Last value. (Used for `split` and
`replace` with `only_first=True`.)
type Match_Iterator
    ## Create an iterator positioned at the start of `input`.

       Arguments:
       - pattern: The pattern to match with.
       - input: The text to search for matches.
    new : Pattern_2 -> Text -> Match_Iterator
    new pattern input = Match_Iterator.Value pattern input 0

    ## PRIVATE
       Arguments:
       - pattern: The pattern to match with.
       - input: The text being searched.
       - cursor: The UTF-16 offset in `input` at which the next search starts.
    Value (pattern : Pattern_2) (input : Text) (cursor : Integer)

    ## Return the next match, or the last filler string if there is no
       additional match.

       Also returns the next iterator, if there was a match.
    next : Match_Iterator_Value
    next self =
        regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
        case regex_result.isMatch of
            # No further match: everything from the cursor onwards is filler.
            False -> self.early_exit
            True ->
                # The filler is the unmatched text between the cursor and the
                # start of this match.
                match_start = regex_result.getStart 0
                filler_range = Range.new self.cursor match_start
                filler_span = Utf_16_Span.Value filler_range self.input
                match = Match_2.Value self.pattern regex_result self.input
                # NOTE(review): for a zero-length match at the cursor,
                # `next_cursor` equals `self.cursor`; confirm the iteration
                # still advances in that case.
                next_cursor = match.utf_16_end 0
                next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
                Match_Iterator_Value.Next filler_span match next_iterator

    ## Returns the remainder of the string, unmatched, as a single `Last`
       value.
    early_exit : Match_Iterator_Value
    early_exit self =
        filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
        filler_span = Utf_16_Span.Value filler_range self.input
        Match_Iterator_Value.Last filler_span

    ## Run the iterator to completion and list, in order, every filler
       (wrapped in double quotes) and every match (wrapped in slashes).
    to_text_debug : Vector Text
    to_text_debug self =
        vb = Vector.new_builder
        go it = case it.next of
            Match_Iterator_Value.Next filler match next_it ->
                vb.append ('\"' + filler.text + '\"')
                vb.append ("/" + (match.span 0).text + "/")
                @Tail_Call go next_it
            Match_Iterator_Value.Last filler ->
                vb.append ('\"' + filler.text + '\"')
        go self
        vb.to_vector
## PRIVATE
A single step produced by `Match_Iterator`: either a match together with the
filler text preceding it and the iterator to continue from (`Next`), or the
final filler after the last match (`Last`).
type Match_Iterator_Value
# NOTE(review): `Match_Iterator` builds these fillers as `Utf_16_Span` values,
# while the fields are annotated as `Span` - confirm which is intended.
Next (filler : Span) (match : Match_2) (next_iterator : Match_Iterator)
Last (filler : Span)
## PRIVATE
   Copy the members of a polyglot object into a `Map` keyed by member name.
polyglot_map_to_map : Any -> Map Any Any
polyglot_map_to_map map =
    member_names = Vector.from_polyglot_array (Polyglot.get_members map)
    entries = member_names.map name-> [name, Polyglot.get_member map name]
    Map.from_vector entries
## PRIVATE
   Look up the entry for the group called `name` in the polyglot group map.
read_group_map : Any -> Text -> Integer | Nothing
read_group_map polyglot_map name =
    polyglot_map_to_map polyglot_map . get name
## PRIVATE
   Extract the full matched text (group 0) from a match, passing `Nothing`
   through unchanged.
match_to_group_maybe : Match_2 | Nothing -> Text | Nothing
match_to_group_maybe match = case match of
    Nothing -> Nothing
    _ -> match.text 0
## PRIVATE
Build an output string from a Match_2 resulting from `tokenize`.
See `tokenize`.
Arguments:
- pattern: The pattern that produced `match`; only its group count is read.
- match: The match to turn into a single token.
build_tokenization_output_from_match : Pattern_2 -> Match_2 -> Text
build_tokenization_output_from_match pattern match =
# A group count of 1 means only group 0 (the whole match) exists, so the
# token is simply the whole match.
if pattern.group_count == 1 then match.text 0 else
# Extract the ranges of the spans of all capturing groups
group_numbers = 1.up_to pattern.group_count
# Groups with no span (`Nothing`) are dropped here.
spans = group_numbers.map (n-> match.span n) . filter Filter_Condition.Not_Nothing
ranges = spans.map span-> case span of Span.Value range _ -> range
# Eliminate nested capturing groups by sorting and merging the ranges.
top_level_ranges = sort_and_merge_ranges ranges
# Reconstruct `Spans` from the synthesized `Ranges`, and concatenate.
# NOTE(review): `spans` is empty when no capturing group participated in the
# match; confirm that `spans.at 0` is harmless in that case.
text_all = case spans.at 0 of Span.Value _ text -> text
top_level_spans = top_level_ranges.map range-> Span.Value range text_all
top_level_spans.map (.text) . join

View File

@ -1,28 +0,0 @@
## A description of how the regex engine will match on the content.
This lets you configure how you want to match, from the `First` match only,
to matching on the `Full` content of the input text.
import project.Data.Numbers.Integer
import project.Data.Text.Matching_Mode.Matching_Mode
type Regex_Mode
## The regex will make all possible matches.
All
## The regex will only match if the _entire_ text matches.
Full
## The regex will only match within the region defined by start..end.
Arguments:
- start: The inclusive start bound of the region.
- end: The exclusive end bound of the region.
- mode: The mode to use within the bounded region.
! Units
The `start` and `end` indices range over _characters_ in the text. The
precise definition of `character` is, for the moment, defined by the
regular expression engine itself.
Bounded (start : Integer) (end : Integer) (mode : (Matching_Mode.First | Matching_Mode.Last | Regex_Mode) = Regex_Mode.All)

View File

@ -1,44 +0,0 @@
## Options are used to configure how a regex engine behaves.
In this file, Enso provides a set of standard options that must be supported
by all regex engines integrated with Enso.
type Regex_Option
## Specifies that all predefined character classes and POSIX character
classes will match _only_ on ASCII characters.
! Performance
If you are _sure_ that your data can only ever contain characters from
the ASCII character set, you may be able to obtain a performance boost
by specifying this flag. This may not be the case on all engines or all
regexes.
Ascii_Matching
## Specifies that matching should be performed in a case-insensitive manner.
Case_Insensitive
## Specifies that the regular expression should be interpreted in comments
mode.
Comments mode has the following changes:
- Whitespace within the pattern is ignored, except when within a
character class or when preceded by an unescaped backslash, or within
grouping constructs (e.g. `(?...)`).
- When a line contains a `#`, that is not in a character class and is not
preceded by an unescaped backslash, all characters from the leftmost
such `#` to the end of the line are ignored. That is to say, they act
as _comments_ in the regex.
Comments
## Specifies that the `.` special character should match everything
_including_ newline characters. Without this flag, it will match all
characters _except_ newlines.
Dot_Matches_Newline
## Specifies that the pattern character `^` matches at both the beginning of
the string and at the beginning of each line (immediately following a
newline), and that the pattern character `$` matches at the end of each
line _and_ at the end of the string.
Multiline

View File

@ -1,10 +1,10 @@
import project.Data.Numbers.Integer
import project.Data.Text.Extensions
import project.Data.Text.Regex.Match_2.Match_2
import project.Data.Text.Regex.Pattern_2.Match_Iterator_Value
import project.Data.Text.Regex.Pattern_2.Pattern_2
import project.Data.Text.Regex_2
import project.Data.Text.Regex_2.No_Such_Group
import project.Data.Text.Regex
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.No_Such_Group
import project.Data.Text.Regex.Pattern.Match_Iterator_Value
import project.Data.Text.Regex.Pattern.Pattern
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
import project.Data.Vector.Vector
@ -23,7 +23,7 @@ type Replacer
Implements a replacement for a regular expression.
Pattern_2.replace uses a Replacer to replace each regex match with
Pattern.replace uses a Replacer to replace each regex match with
a replacement string. This string can contain references to match
groups from the original regex.
@ -40,7 +40,7 @@ type Replacer
Arguments
- replacement_string: a string, possibly containing group references,
that will be used to provide a replacement in a regex match.
new : Text -> Pattern_2 -> Replacer ! No_Such_Group
new : Text -> Pattern -> Replacer ! No_Such_Group
new replacement_string pattern =
Replacer.Value (build_replacement_vector_cached replacement_string pattern)
@ -48,7 +48,7 @@ type Replacer
Arguments:
- match: the match from the original string that is to be replaced.
replace : Match_2 -> Text
replace : Match -> Text
replace self match =
string_builder = StringBuilder.new
self.replacement.each replacement->
@ -82,7 +82,7 @@ group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))"
Uses Replacement_Cache to avoid rebuilding the vector for recently used
replacement strings.
build_replacement_vector_cached : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
build_replacement_vector_cached : Text -> Pattern -> Vector Replacement ! No_Such_Group
build_replacement_vector_cached replacement_string pattern =
Replacer_Cache.get_or_set replacement_string _->
build_replacement_vector replacement_string pattern
@ -93,9 +93,9 @@ build_replacement_vector_cached replacement_string pattern =
Parse the replacement string into an alternating series of literal
strings and group reference numbers.
build_replacement_vector : Text -> Pattern_2 -> Vector Replacement ! No_Such_Group
build_replacement_vector : Text -> Pattern -> Vector Replacement ! No_Such_Group
build_replacement_vector replacement_string pattern =
replacement_pattern = Regex_2.compile group_reference_regex
replacement_pattern = Regex.compile group_reference_regex
it = replacement_pattern.iterator replacement_string
builder = Vector.new_builder
@ -117,14 +117,14 @@ build_replacement_vector replacement_string pattern =
Parse a capture group reference.
Arguments:
- pattern: the Pattern_2 used to initiate the replacement. This is used
- pattern: the Pattern used to initiate the replacement. This is used
to identify and validate capture groups.
- match: the match of the replacement string against group_reference_regex.
Returns a Replacement: a group number, or, in the case of `$$`, a literal.
See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
parse_group_number : Pattern_2 -> Match_2 -> Replacement ! No_Such_Group
parse_group_number : Pattern -> Match -> Replacement ! No_Such_Group
parse_group_number pattern match = case match.text.take 2 of
"$$" -> Replacement.Literal "$"
"$<" ->

View File

@ -1,86 +0,0 @@
import project.Any.Any
import project.Data.Numbers.Integer
import project.Data.Text.Prim_Text_Helper
import project.Data.Text.Regex.Pattern_2.Pattern_2
import project.Data.Text.Text
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
from project.Errors.Common import Syntax_Error
polyglot java import java.util.regex.Pattern as Java_Pattern
## Compile the provided `expression` into a regex pattern that can be used for
matching.
Arguments
- expression: The text representing the regular expression that you want to
compile. Must be non-empty.
- case_insensitive: Enables or disables case-insensitive matching. Case
insensitive matching behaves as if it normalises the case of all input
text before matching on it.
If an empty regex is used, `compile` throws an Illegal_Argument error.
? Why Compile?
While many regex engines are able to cache ad-hoc patterns, it is often
useful to be able to manually retain a pattern that you have computed. This
function exists so you can hold onto the resultant `Pattern_2` object,
instead of immediately proceeding to match using it.
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error | Illegal_Argument
compile self expression case_insensitive=Nothing =
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
options_string = if case_insensitive == True then "usgi" else "usg"
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
Pattern_2.Value internal_regex_object
## ADVANCED
Escape the special characters in `expression` such that the result is a
valid literal pattern for the original string.
Arguments:
- expression: The expression to escape metacharacters in.
> Example
Turn a Text into a regex that matches that string exactly.
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
example_escape =
literal_string = "\!\.|abcde"
engine = Default_Engine.new
engine.escape literal_string
escape : Text -> Text
escape self expression = Java_Pattern.quote expression
## An error that is emitted when there is no such group in the match for the
provided `id`.
Arguments:
- id: The identifier of the group that was asked for but does not exist.
type No_Such_Group
Error (id : Text | Integer)
## PRIVATE
Provides a human-readable representation of the `No_Such_Group`.
to_display_text : Text
to_display_text self = case self.id of
_ : Integer -> "No group exists with the index " + self.id.to_text + "."
_ : Text -> "No group exists with the name " + self.id + "."
## A syntax error reported by the Truffle regex compiler.
type Regex_Syntax_Error
## PRIVATE
Arguments:
- message: A description of the erroneous syntax.
Error message

View File

@ -1,112 +0,0 @@
import project.Any.Any
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Data.Text.Matching
import project.Data.Text.Regex
import project.Data.Text.Regex.Pattern.Pattern
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Problem_Behavior.Problem_Behavior
from project.Data.Boolean import Boolean, True, False
## Represents regex matching mode.
type Regex_Matcher
## Regex matching mode.
Arguments:
- case_sensitivity: Specifies whether the matching should be case
sensitive.
- multiline: Enables or disables the multiline option. Multiline
specifies that the `^` and `$` pattern characters match the start and
end of lines, as well as the start and end of the input,
respectively.
- match_ascii: Enables or disables pure-ASCII matching for the regex. If
you know your data only contains ASCII, you can enable this for a
performance boost on some regex engines.
- dot_matches_newline: Enables or disables the dot matches newline
option. This specifies that the `.` special character should match
everything _including_ newline characters. Without this flag, it
matches all characters _except_ newlines.
- comments: Enables or disables the comments mode for the regular
expression. In comments mode, the following changes apply:
- Whitespace within the pattern is ignored, except when within a
character class or when preceded by an unescaped backslash, or within
grouping constructs (e.g. `(?...)`).
- When a line contains a `#` that is not in a character class and is
not preceded by an unescaped backslash, all characters from the
leftmost such `#` to the end of the line are ignored. That is to say;
they act as 'comments' in the regex.
Value (case_sensitivity : Case_Sensitivity = Case_Sensitivity.Sensitive) (multiline : Boolean = False) (match_ascii : Boolean = False) (dot_matches_newline : Boolean = False) (comments : Boolean = False)
## UNSTABLE
Compiles a provided pattern according to the rules defined in this
`Regex_Matcher`.
compile : Text -> Pattern
compile self pattern =
case_insensitive = case self.case_sensitivity of
Case_Sensitivity.Default -> False
Case_Sensitivity.Sensitive -> False
## TODO [RW] Currently locale is not supported in case-insensitive
Regex matching. There are plans to revisit it:
https://www.pivotaltracker.com/story/show/181313576
Case_Sensitivity.Insensitive _ -> True
Regex.compile pattern case_insensitive=case_insensitive match_ascii=self.match_ascii dot_matches_newline=self.dot_matches_newline multiline=self.multiline comments=self.comments
## UNSTABLE
Checks if a name matches the provided criterion according to the specified
matching strategy.
Arguments:
- name: A `Text` representing the name being matched.
- criterion: A `Text` representing the regular expression specifying the
matching criterion.
> Example
Check if the provided name matches a regular expression.
(Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive) . match_single_criterion "Foobar" "f.*" == True
match_single_criterion : Text -> Text -> Boolean
match_single_criterion self name criterion =
self.compile criterion . matches name
## UNSTABLE
Selects objects from an input list that match any of the provided criteria.
Arguments:
- objects: A list of objects to be matched.
- criteria: A list of texts representing the matching criteria. Their meaning
depends on the matching strategy.
- reorder: Specifies whether to reorder the matched objects according to the
order of the matching criteria.
If `False`, the matched entries are returned in the same order as in the
input.
If `True`, the matched entries are returned in the order of the criteria
matching them. If a single object has been matched by multiple criteria, it
is placed in the group belonging to the first matching criterion on the
list.
If a single criterion's group has more than one element, their relative
order is the same as in the input.
- name_mapper: A function mapping a provided object to its name, which will
then be matched with the criteria. It is set to the identity function by
default, thus allowing the input to be a list of names to match. But it can
be overridden to enable matching more complex objects.
- matcher: A `Matcher` instance specifying how to interpret the criterion.
- on_problems: Specifies the behavior when a problem occurs during the
function.
By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
If set to `Ignore`, the operation proceeds without errors or warnings.
> Example
Selects objects matching one of the provided patterns, preserving the input order.
Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive . match_criteria ["foo", "foobar", "quux", "baz", "Foo"] [".*ba.*", "f.*"] == ["foo", "foobar", "baz"]
> Example
Selects pairs matching their first element with the provided criteria and
ordering the result according to the order of criteria that matched them.
Text_Matcher.match_criteria [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10] ["bar", "foo"] reorder=True name_mapper=_.name == [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0]
match_criteria : Vector Any -> Vector Text -> Boolean -> (Any -> Text) -> Problem_Behavior -> Vector Any ! Matching.No_Matches_Found
match_criteria self objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning =
Matching.match_criteria_implementation self objects criteria reorder name_mapper on_problems

View File

@ -1,75 +0,0 @@
import project.Any.Any
import project.Data.Locale.Locale
import project.Data.Text.Matching
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Problem_Behavior.Problem_Behavior
from project.Data.Boolean import Boolean, True, False
## Represents exact text matching mode.
type Text_Matcher
## Represents exact text matching mode.
Case_Sensitive
## Represents case-insensitive text matching mode.
Case_Insensitive (locale:Locale=Locale.default)
## UNSTABLE
Checks if a name matches the provided criterion according to the specified
matching strategy.
Arguments:
- name: A `Text` representing the name being matched.
- criterion: A `Text` representing the name to be matched.
> Example
Check if the provided name matches the given text.
Text_Matcher.match_single_criterion "Foobar" "foo" == False
match_single_criterion : Text -> Text -> Boolean
match_single_criterion self name criterion = case self of
Text_Matcher.Case_Sensitive -> name == criterion
Text_Matcher.Case_Insensitive locale -> name.equals_ignore_case criterion locale=locale
## UNSTABLE
Selects objects from an input list that match any of the provided criteria.
Arguments:
- objects: A list of objects to be matched.
- criteria: A list of texts representing the matching criteria. Their meaning
depends on the matching strategy.
- reorder: Specifies whether to reorder the matched objects according to the
order of the matching criteria.
If `False`, the matched entries are returned in the same order as in the
input.
If `True`, the matched entries are returned in the order of the criteria
matching them. If a single object has been matched by multiple criteria, it
is placed in the group belonging to the first matching criterion on the
list.
If a single criterion's group has more than one element, their relative
order is the same as in the input.
- name_mapper: A function mapping a provided object to its name, which will
then be matched with the criteria. It is set to the identity function by
default, thus allowing the input to be a list of names to match. But it can
be overridden to enable matching more complex objects.
- matcher: A `Matcher` instance specifying how to interpret the criterion.
- on_problems: Specifies the behavior when a problem occurs during the
function.
By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
If set to `Ignore`, the operation proceeds without errors or warnings.
> Example
Selects objects matching one of the provided patterns, preserving the input order.
Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive . match_criteria ["foo", "foobar", "quux", "baz", "Foo"] [".*ba.*", "f.*"] == ["foo", "foobar", "baz"]
> Example
Selects pairs matching their first element with the provided criteria and
ordering the result according to the order of criteria that matched them.
Text_Matcher.match_criteria [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10] ["bar", "foo"] reorder=True name_mapper=_.name == [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0]
match_criteria : Vector Any -> Vector Text -> Boolean -> (Any -> Text) -> Problem_Behavior -> Vector Any ! Matching.No_Matches_Found
match_criteria self objects criteria reorder=False name_mapper=(x->x) on_problems=Problem_Behavior.Report_Warning =
Matching.match_criteria_implementation self objects criteria reorder name_mapper on_problems

View File

@ -94,10 +94,6 @@ import project.Data.Text.Line_Ending_Style.Line_Ending_Style
import project.Data.Text.Location.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
import project.Data.Text.Regex.Regex_Option.Regex_Option
import project.Data.Text.Regex_Matcher.Regex_Matcher
import project.Data.Text.Text_Matcher.Text_Matcher
import project.Data.Text.Text_Ordering.Text_Ordering
import project.Data.Text.Text_Sub_Range.Text_Sub_Range
import project.Data.Time.Date.Date
@ -146,10 +142,6 @@ export project.Data.Text.Line_Ending_Style.Line_Ending_Style
export project.Data.Text.Location.Location
export project.Data.Text.Matching_Mode.Matching_Mode
export project.Data.Text.Regex
export project.Data.Text.Regex.Regex_Mode.Regex_Mode
export project.Data.Text.Regex.Regex_Option.Regex_Option
export project.Data.Text.Regex_Matcher.Regex_Matcher
export project.Data.Text.Text_Matcher.Text_Matcher
export project.Data.Text.Text_Ordering.Text_Ordering
export project.Data.Text.Text_Sub_Range.Text_Sub_Range
export project.Data.Time.Date.Date

View File

@ -1,12 +1,10 @@
from Standard.Base import all
import Standard.Base.Errors.Common.No_Such_Method
import Standard.Base.Data.Text.Regex.Match.Match
import Standard.Base.Errors.Common.No_Such_Method
import Standard.Base.Network.HTTP.Response.Response
import Standard.Base.Network.HTTP.Response_Body.Response_Body
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Engine.Default.Match as Default_Engine_Match
from Standard.Table import Table, Column
from Standard.Image import Image, Read_Flag, Matrix
@ -269,8 +267,7 @@ transactions_table =
(enso_project.data / "food_shop_transactions.csv") . read
## An example regex match.
match : Default_Engine_Match
match : Match
match =
engine = Default_Engine.new
pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
pattern.match "aa ab abc a bc bcd" mode=Matching_Mode.First
pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
pattern.match "aa ab abc a bc bcd"

View File

@ -6,71 +6,6 @@ import java.util.regex.Pattern;
public class Regex_Utils {
/**
* Obtains the names for named groups.
*
* <p>Assumes that the provided {@link Pattern} is syntactically valid. Behaviour is undefined if
* run on a syntactically invalid pattern.
*
* @param pattern the pattern for which to get the group names
* @return the names for the named groups in {@code pattern}
*/
public static String[] get_group_names(Pattern pattern) {
String pattern_text = pattern.pattern();
char[] characters = pattern_text.toCharArray();
ArrayList<String> names = new ArrayList<>();
for (int i = 0; i < pattern_text.length(); ++i) {
char character = characters[i];
if (character == '\\') {
++i;
break;
}
String header = "(?<";
if (pattern_text.startsWith(header, i)) {
i += header.length();
StringBuilder buffer = new StringBuilder();
while (i < pattern_text.length()) {
character = characters[i];
if (character == '>') {
break;
}
++i;
buffer.append(character);
}
names.add(buffer.toString());
}
}
return names.toArray(new String[0]);
}
/**
* Looks for matches of the provided regular expression in the provided text.
*
* <p>This should behave exactly the same as `Regex.compile regex . find text` in Enso, it is here
* only as a temporary workaround, because the Enso function gives wrong results on examples like
* `Regex.compile "([0-9]+|[^0-9]+)" . find "1a2c"` where it returns `[1, a, 2]` instead of `[1,
* a, 2, c]`.
*/
public static String[] find_all_matches(String regex, String text) {
var allMatches = new ArrayList<String>();
Matcher m = Pattern.compile(regex).matcher(text);
while (m.find()) {
allMatches.add(m.group());
}
return allMatches.toArray(new String[0]);
}
/**
* Converts a SQL-like pattern into a Regex with the same semantics.
*
@ -87,7 +22,7 @@ public class Regex_Utils {
// Before inserting the converted wildcard, we append the accumulated characters, quoting
// them first.
if (acc.length() > 0) {
result.append(Pattern.quote(acc.toString()));
result.append(regexQuote(acc.toString()));
acc.setLength(0);
}
@ -103,7 +38,7 @@ public class Regex_Utils {
// If any trailing characters were left, we append them too.
if (acc.length() > 0) {
result.append(Pattern.quote(acc.toString()));
result.append(regexQuote(acc.toString()));
}
return result.toString();

View File

@ -8,7 +8,7 @@ type Setup
make_expected_output_regex expected_output =
parts = expected_output.split "???" . map Regex.escape
Regex.compile (parts.join ".+") dot_matches_newline=True
Regex.compile (parts.join ".+")
spec setup =
run_test source_path =

View File

@ -2,6 +2,7 @@ from Standard.Base import all
import Standard.Base.Data.Range.Empty_Error
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
import Standard.Base.Errors.Common.No_Such_Method
import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Common.Unsupported_Argument_Types
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Errors.Illegal_State.Illegal_State
@ -143,8 +144,8 @@ spec = Test.group "Range" <|
range.filter (Filter_Condition.Not_In [7, 3, 2]) . should_equal [1, 4, 5]
Test.expect_panic_with (range.filter (Filter_Condition.Starts_With "a")) No_Such_Method
Test.expect_panic_with (range.filter (Filter_Condition.Like "a%")) Unsupported_Argument_Types
Test.expect_panic_with (range.filter (Filter_Condition.Not_Like "a_")) Unsupported_Argument_Types
range.filter (Filter_Condition.Like "a%") . should_fail_with Type_Error
range.filter (Filter_Condition.Not_Like "a_") . should_fail_with Type_Error
range.filter Filter_Condition.Is_True . should_equal []
range.filter Filter_Condition.Is_False . should_equal []
range.filter Filter_Condition.Is_Nothing . should_equal []

View File

@ -1,613 +0,0 @@
from Standard.Base import all
import Standard.Base.Data.Text.Span.Utf_16_Span
import Standard.Base.Errors.Common.Syntax_Error
import Standard.Base.Data.Text.Matching_Mode.Matching_Mode
from Standard.Base.Data.Text.Regex import No_Such_Group, Invalid_Option
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
polyglot java import java.util.regex.Pattern as Java_Pattern
from Standard.Test import Test, Test_Suite
import Standard.Test.Extensions
# The Java `Pattern` flag bitmask the tests below expect when no Enso options
# are supplied: CANON_EQ, UNICODE_CASE and UNICODE_CHARACTER_CLASS or-ed together.
default_mask = Java_Pattern.CANON_EQ.bit_or Java_Pattern.UNICODE_CASE . bit_or Java_Pattern.UNICODE_CHARACTER_CLASS
spec =
# Checks that `Default_Engine.from_enso_options` turns a list of Enso regex
# options into the expected Java `Pattern` flag bitmask.
Test.group "The default regex engine's options handling" <|
    Test.specify "should convert options to Java" <|
        options = [Regex_Option.Comments, Regex_Option.Multiline, Default_Engine.Option.Unix_Lines]
        expected_mask = Java_Pattern.UNIX_LINES.bit_or Java_Pattern.COMMENTS . bit_or Java_Pattern.MULTILINE . bit_or default_mask
        actual_mask = Default_Engine.from_enso_options options
        actual_mask . should_equal expected_mask
    Test.specify "should specify the unicode options by default" <|
        actual_mask = Default_Engine.from_enso_options []
        actual_mask . should_equal default_mask
    Test.specify "should handle ascii matching by disabling unicode" <|
        # With `Ascii_Matching` the expected mask is 0, i.e. none of the flags in `default_mask`.
        actual_mask = Default_Engine.from_enso_options [Regex_Option.Ascii_Matching]
        actual_mask . should_equal 0
    Test.specify "should result in an error when an option is invalid" <|
        # A value that is not an option (here an empty text) must be rejected,
        # also when it is mixed with valid options.
        Default_Engine.from_enso_options [""] . should_fail_with Invalid_Option
        Default_Engine.from_enso_options ["", Regex_Option.Ascii_Matching] . should_fail_with Invalid_Option
# Checks pattern compilation through `Default_Engine`: the compiled pattern must
# remember its engine and options, and its Java flags must combine the default
# mask with the flags of every requested option.
Test.group "The default regex engine (Default_Engine)" <|
    Test.specify "should be able to compile patterns with no options" <|
        engine = Default_Engine.new
        pattern = engine.compile "^a$" []
        pattern.engine . should_equal engine
        pattern.options . should_equal []
        pattern.internal_pattern.flags . should_equal default_mask
    Test.specify "should be able to compile patterns with global options" <|
        engine = Default_Engine.new
        pattern = engine.compile "^a$" [Regex_Option.Multiline]
        pattern.engine . should_equal engine
        pattern.options . should_equal [Regex_Option.Multiline]
        pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.MULTILINE)
    Test.specify "should be able to compile patterns with engine-specific options" <|
        # Options given to `Default_Engine.new` apply to every pattern the engine compiles.
        engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
        pattern = engine.compile "^a$" []
        pattern.engine . should_equal engine
        pattern.options . should_equal [Default_Engine.Option.Literal_Pattern]
        pattern.internal_pattern.flags . should_equal (default_mask.bit_or Java_Pattern.LITERAL)
    Test.specify "should be able to compile patterns with combined options" <|
        engine = Default_Engine.new [Default_Engine.Option.Literal_Pattern]
        pattern = engine.compile "^a$" [Regex_Option.Comments]
        pattern.engine . should_equal engine
        # Only membership is checked, so the order of the merged options is not pinned.
        pattern.options.contains Default_Engine.Option.Literal_Pattern . should_be_true
        pattern.options.contains Regex_Option.Comments . should_be_true
        pattern.internal_pattern.flags . should_equal (default_mask . bit_or Java_Pattern.LITERAL . bit_or Java_Pattern.COMMENTS)
    Test.specify "should return a syntax error if the regex syntax is invalid" <|
        engine = Default_Engine.new
        engine.compile "^(a" [] . should_fail_with Syntax_Error
    Test.specify "should throw an invalid options error if an option is invalid" <|
        engine = Default_Engine.new
        engine.compile "^a$" ["invalid"] . should_fail_with Invalid_Option
    Test.specify "should escape an expression for use as a literal" <|
        pattern = "http://example.com"
        engine = Default_Engine.new
        # Escaping wraps the text in a `\Q...\E` quotation.
        engine.escape pattern . should_equal "\Qhttp://example.com\E"
# Checks `Pattern.matches`, which reports whether the whole input matches the pattern.
Test.group "The default regex engine's Pattern.matches" <|
    engine = Default_Engine.new
    Test.specify "should return True when the pattern matches against the input" <|
        pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
        input = "aa ab abc a bc bcd"
        pattern.matches input . should_be_true
    Test.specify "should return False when the pattern doesn't match against the input" <|
        pattern = engine.compile "aaz" []
        input = "aa ab abc a bc bcd"
        pattern.matches input . should_be_false
    Test.specify "should check for full matches" <|
        # A match of only a prefix ("foo" within "foobar") must not count.
        pattern = engine.compile "f.o" []
        pattern.matches "foo" . should_be_true
        pattern.matches "foobar" . should_be_false
# Checks `Pattern.match` in each matching mode: first match, a counted number of
# matches, all matches, full-input match and a bounded region of the input.
# A mode that finds nothing is expected to yield `Nothing`.
Test.group "The default regex engine's Pattern.match" <|
    engine = Default_Engine.new
    Test.specify "should be able to `match` the first instance of the pattern in the input" <|
        pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
        input = "aa ab abc a bc bcd"
        match = pattern.match input mode=Matching_Mode.First
        match . should_be_a Default_Engine.Match.Value
        match.group 0 . should_equal input
    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
        input = "abc"
        match = pattern.match input mode=Matching_Mode.First
        match . should_equal Nothing
    Test.specify "should be able to `match` at most N instances of the pattern in the input" <|
        # An integer mode is the maximum number of matches to return.
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.match input mode=3
        match.length . should_equal 3
        match.at 0 . group 0 . should_equal "ab"
        match.at 1 . group 0 . should_equal "cd"
        match.at 2 . group 0 . should_equal "ef"
    Test.specify "should `match` fewer than N instances when there are fewer than N in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdef"
        match = pattern.match input mode=5
        match.length . should_equal 3
        match.at 0 . group 0 . should_equal "ab"
        match.at 1 . group 0 . should_equal "cd"
        match.at 2 . group 0 . should_equal "ef"
    Test.specify "should return `Nothing` when a counted match fails" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.match input mode=3
        match . should_equal Nothing
    Test.specify "should be able to `match` all instances of the pattern in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.match input mode=Regex_Mode.All
        match.length . should_equal 5
        match.at 0 . group 0 . should_equal "ab"
        match.at 1 . group 0 . should_equal "cd"
        match.at 2 . group 0 . should_equal "ef"
        match.at 3 . group 0 . should_equal "gh"
        match.at 4 . group 0 . should_equal "ij"
    Test.specify "should return `Nothing` when an all-mode match fails" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.match input mode=Regex_Mode.All
        match . should_equal Nothing
    Test.specify "should be able to `match` the pattern against the entire input" <|
        pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
        input = "aa ab abc a bc bcd"
        match = pattern.match input mode=Regex_Mode.Full
        match . should_be_a Default_Engine.Match.Value
        match.group 0 . should_equal input
    Test.specify "should return `Nothing` if a full match does not match the entire input" <|
        pattern = engine.compile "(..)" []
        input = "aa ab"
        full_match = pattern.match input mode=Regex_Mode.Full
        full_match . should_equal Nothing
        # The same pattern does match a prefix, so only the full-input requirement fails.
        match = pattern.match input mode=Matching_Mode.First
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should be able to `match` the pattern against bounded input" <|
        # Only the region between offsets 2 and 8 ("cdefgh") is searched.
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.match input mode=(Regex_Mode.Bounded 2 8)
        match.length . should_equal 3
        match.at 0 . group 0 . should_equal "cd"
        match.at 1 . group 0 . should_equal "ef"
        match.at 2 . group 0 . should_equal "gh"
    Test.specify "should correctly handle empty patterns" pending="Figure out how to make Regex correctly handle empty patterns." <|
        # Pending: an empty pattern is expected to match once at every position,
        # including the position at the end of the input.
        pattern = engine.compile "" []
        match_1 = pattern.match "" mode=Regex_Mode.All
        match_1.length . should_equal 1
        match_1.at 0 . start 0 . should_equal 0
        match_1.at 0 . end 0 . should_equal 0
        match_2 = pattern.match "ABC" mode=Regex_Mode.All
        match_2.length . should_equal 4
        match_2.at 0 . start 0 . should_equal 0
        match_2.at 0 . end 0 . should_equal 0
        match_2.at 1 . start 0 . should_equal 1
        match_2.at 1 . end 0 . should_equal 1
        match_2.at 3 . start 0 . should_equal 3
        match_2.at 3 . end 0 . should_equal 3
# Checks `Pattern.find`, which returns the matched text (or a vector of texts)
# rather than match objects, in each matching mode. A mode that finds nothing
# is expected to yield `Nothing`.
Test.group "The default regex engine's Pattern.find" <|
    engine = Default_Engine.new
    Test.specify "should be able to `find` the first instance of the pattern in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.find input mode=Matching_Mode.First
        match . should_be_a Text
        match . should_equal "ab"
    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.find input mode=Matching_Mode.First
        match . should_equal Nothing
    Test.specify "should be able to `find` at most N instances of the pattern in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.find input mode=3
        match.length . should_equal 3
        match.at 0 . should_equal "ab"
        match.at 1 . should_equal "cd"
        match.at 2 . should_equal "ef"
    Test.specify "should `find` fewer than N instances when there are fewer than N in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdef"
        match = pattern.find input mode=5
        match.length . should_equal 3
        match.at 0 . should_equal "ab"
        match.at 1 . should_equal "cd"
        match.at 2 . should_equal "ef"
    Test.specify "should return `Nothing` when a counted match fails" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.find input mode=3
        match . should_equal Nothing
    Test.specify "should be able to `find` all instances of the pattern in the input" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.find input mode=Regex_Mode.All
        match.length . should_equal 5
        match.at 0 . should_equal "ab"
        match.at 1 . should_equal "cd"
        match.at 2 . should_equal "ef"
        match.at 3 . should_equal "gh"
        match.at 4 . should_equal "ij"
    Test.specify "should return `Nothing` when an all-mode match fails" <|
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.find input mode=Regex_Mode.All
        match . should_equal Nothing
    Test.specify "should be able to `find` the pattern against the entire input" <|
        pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
        input = "aa ab abc a bc bcd"
        match = pattern.find input mode=Regex_Mode.Full
        match . should_be_a Text
        match . should_equal input
    Test.specify "should return `Nothing` if a full find does not match the entire input" <|
        pattern = engine.compile "(..)" []
        input = "aa ab"
        full_match = pattern.find input mode=Regex_Mode.Full
        full_match . should_equal Nothing
    Test.specify "should be able to `find` the pattern against bounded input" <|
        # Only the region between offsets 2 and 8 ("cdefgh") is searched.
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        match = pattern.find input mode=(Regex_Mode.Bounded 2 8)
        match.length . should_equal 3
        match.at 0 . should_equal "cd"
        match.at 1 . should_equal "ef"
        match.at 2 . should_equal "gh"
        # A nested integer mode caps the number of matches taken from the region.
        match_2 = pattern.find input mode=(Regex_Mode.Bounded 2 8 mode=10)
        match_2.length . should_equal 3
        match_2.at 0 . should_equal "cd"
        match_2.at 1 . should_equal "ef"
        match_2.at 2 . should_equal "gh"
        match_3 = pattern.find input mode=(Regex_Mode.Bounded 2 8 mode=2)
        match_3.length . should_equal 2
        match_3.at 0 . should_equal "cd"
        match_3.at 1 . should_equal "ef"
    Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
        # Regression checks: the final single-character match must not be dropped.
        engine.compile "(a+|1+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
        engine.compile "([a]+|[1]+)" [] . find "a1a1" . should_equal ["a", "1", "a", "1"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" . should_equal ["a", "1", "b", "2"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=5 . should_equal ["a", "1", "b", "2"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=4 . should_equal ["a", "1", "b", "2"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=3 . should_equal ["a", "1", "b"]
        engine.compile "([0-9]+|[^0-9]+)" [] . find "a1b2" mode=(Regex_Mode.Bounded 1 3) . should_equal ["1", "b"]
# Checks `Pattern.split`, which cuts the input at matches of the pattern and
# returns the pieces between them. The matched text itself is not part of the result.
Test.group "The default regex engine's Pattern.split" <|
    engine = Default_Engine.new
    Test.specify "should be able to `split` on the first instance of the pattern" <|
        pattern = engine.compile "cd" []
        input = "abcdefghij"
        match = pattern.split input mode=Matching_Mode.First
        match.length . should_equal 2
        match.at 0 . should_equal "ab"
        match.at 1 . should_equal "efghij"
    Test.specify "should return the original text if there are no matches in first mode" <|
        # Unlike `match` and `find`, a failed split yields the whole input, not `Nothing`.
        pattern = engine.compile "(aa)" []
        input = "abcdefghij"
        match = pattern.split input mode=Matching_Mode.First
        match . should_equal ["abcdefghij"]
    Test.specify "should be able to `split` on at most N instances of the pattern in the input" <|
        # After 3 splits the remainder ("eaf") is returned unsplit.
        pattern = engine.compile "a" []
        input = "bacadaeaf"
        match = pattern.split input mode=3
        match.length . should_equal 4
        match.at 0 . should_equal "b"
        match.at 1 . should_equal "c"
        match.at 2 . should_equal "d"
        match.at 3 . should_equal "eaf"
    Test.specify "should `split` on fewer than N instances when there are fewer than N in the input" <|
        pattern = engine.compile "a" []
        input = "bacadaeaf"
        match = pattern.split input mode=10
        match.length . should_equal 5
        match.at 0 . should_equal "b"
        match.at 1 . should_equal "c"
        match.at 2 . should_equal "d"
        match.at 3 . should_equal "e"
        match.at 4 . should_equal "f"
    Test.specify "should be able to `split` on all instances of the pattern in the input" <|
        pattern = engine.compile "(a)" []
        input = "bacadaeaf"
        match = pattern.split input mode=Regex_Mode.All
        match.length . should_equal 5
        match.at 0 . should_equal "b"
        match.at 1 . should_equal "c"
        match.at 2 . should_equal "d"
        match.at 3 . should_equal "e"
        match.at 4 . should_equal "f"
# Checks `Pattern.replace` in each matching mode, including `$N` and `${name}`
# capture-group references in the replacement text. When nothing matches, the
# input is expected to come back unchanged.
Test.group "The default regex engine's Pattern.replace" <|
    engine = Default_Engine.new
    Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
        pattern = engine.compile "abc" []
        input = "aa ab abc a bc abc"
        match = pattern.replace input "REPLACED" mode=Matching_Mode.First
        match . should_be_a Text
        match . should_equal "aa ab REPLACED a bc abc"
    Test.specify "should return the string unchanged if there are no matches to replace in first mode" <|
        pattern = engine.compile "xyz" []
        input = "aa ab ac ad"
        match = pattern.replace input "REPLACED" mode=Matching_Mode.First
        match . should_equal input
    Test.specify "should be able to replace at most N instances of the pattern in the input" <|
        pattern = engine.compile "aa" []
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "REPLACED" mode=3
        match . should_equal "REPLACED ab REPLACED ac ad REPLACED aa ax"
    Test.specify "should replace fewer than N instances when there are fewer than N in the input" <|
        pattern = engine.compile "aa" []
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "REPLACED" mode=10
        match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
    Test.specify "should return the input when a counted replace fails" <|
        pattern = engine.compile "aa" []
        input = "abcdefghij"
        match = pattern.replace input "REPLACED" mode=3
        match . should_equal input
    Test.specify "should be able to replace all instances of the pattern in the input" <|
        pattern = engine.compile "aa" []
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "REPLACED" mode=Regex_Mode.All
        match . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
    Test.specify "should return the input when an all replace fails" <|
        pattern = engine.compile "aa" []
        input = "abcdefghij"
        match = pattern.replace input "REPLACED" mode=Regex_Mode.All
        match . should_equal input
    Test.specify "should be able to replace the entire input only if it matches" <|
        pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
        input = "aa ab abc a bc bcd"
        match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
        match . should_equal "REPLACED"
    Test.specify "should correctly replace entire input in Full mode even if partial matches are possible" <|
        # "aaa" has a partial match ("aa") but no full match, so it must stay untouched.
        pattern = engine.compile "(aa)+" []
        pattern.replace "aaa" "REPLACED" mode=Regex_Mode.Full . should_equal "aaa"
        pattern.replace "aaaa" "REPLACED" mode=Regex_Mode.Full . should_equal "REPLACED"
    Test.specify "should return the input for a full replace if the pattern doesn't match the entire input" <|
        pattern = engine.compile "(..)" []
        input = "aa ab"
        full_match = pattern.replace input "REPLACED" mode=Regex_Mode.Full
        full_match . should_equal input
    Test.specify "should not perform overlapping replacements in counted mode" <|
        pattern = engine.compile "(..)" []
        input = "abcdefghij"
        result = pattern.replace input "REPLACED" mode=3
        result . should_equal "REPLACEDREPLACEDREPLACEDghij"
    Test.specify "should not perform overlapping replacements in all mode" <|
        # "aa ab" splits into "aa", " a" and a trailing "b" that is too short to match.
        pattern = engine.compile "(..)" []
        input = "aa ab"
        match = pattern.replace input "REPLACED" mode=Regex_Mode.All
        match . should_equal "REPLACEDREPLACEDb"
    Test.specify "should handle capture groups in replacement" <|
        pattern = engine.compile "(?<capture>[a-z]+)" []
        # References by group index.
        pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$1]" mode=0 . should_equal "foo bar, baz"
        pattern.replace "foo bar, baz" "[$1]" mode=1 . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[$1]" mode=2 . should_equal "[foo] [bar], baz"
        pattern.replace "foo bar, baz" "[$1]" mode=3 . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$1]" mode=4 . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[$1]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"
        # References by group name must behave exactly like the index form above.
        pattern.replace "foo bar, baz" "[${capture}]" mode=Regex_Mode.All . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[${capture}]" mode=0 . should_equal "foo bar, baz"
        pattern.replace "foo bar, baz" "[${capture}]" mode=1 . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[${capture}]" mode=2 . should_equal "[foo] [bar], baz"
        pattern.replace "foo bar, baz" "[${capture}]" mode=3 . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[${capture}]" mode=4 . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.First . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[${capture}]" mode=Matching_Mode.Last . should_equal "foo bar, [baz]"
    Test.specify "should handle capture groups in replacement in Full mode" <|
        pattern = engine.compile "([a-z]+)" []
        pattern.replace "foo bar, baz" "[$1]" mode=Regex_Mode.Full . should_equal "foo bar, baz"
        pattern.replace "foo" "[$1]" mode=Regex_Mode.Full . should_equal "[foo]"
        pattern_2 = engine.compile '<a href="(?<addr>.*?)">(?<name>.*?)</a>' []
        pattern_2.replace '<a href="url">content</a>' "$2 <- $1" mode=Regex_Mode.Full . should_equal "content <- url"
        pattern_2.replace '<a href="url">content</a>' "${name} <- ${addr}" mode=Regex_Mode.Full . should_equal "content <- url"
# Checks `Match.group`, which looks up a single capture group by index or by name.
# The shared pattern has four groups; the tests below expect the two trailing
# optional ones (index 3 and the named group "empty") to be unmatched.
Test.group "Match.group" <|
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First
    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should return the full match with index 0" <|
        match.group 0 . should_equal "aa ab abc a bc bcd"
    Test.specify "should return the group contents if it matches by index" <|
        match.group 1 . should_equal "aa ab "
    Test.specify "should return the group contents if it matches by name" <|
        match.group "letters" . should_equal "abc a bc bcd"
    Test.specify "should return Nothing if the group did not match" <|
        match.group 3 . should_equal Nothing
    Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
        # Index 5 is out of range: the valid indices are 0 (whole match) to 4.
        match.group "fail" . should_fail_with No_Such_Group
        match.group 5 . should_fail_with No_Such_Group
    Test.specify "should make named groups accessible by index" <|
        match.group 2 . should_equal (match.group "letters")
# Checks `Match.groups`, which returns the contents of every group (index 0 is
# the whole match), optionally substituting a default for unmatched groups.
Test.group "Match.groups" <|
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First
    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should return the results of all groups" <|
        groups = match.groups
        groups.length . should_equal 5
        groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing]
    Test.specify "should replace unmatched groups by a user-specified value" <|
        groups = match.groups "UNMATCHED"
        groups.length . should_equal 5
        groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"]
# Checks `Match.named_groups`, which returns only the named groups, keyed by
# name, optionally substituting a default for unmatched groups.
Test.group "Match.named_groups" <|
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First
    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should return the results of all named groups" <|
        groups = match.named_groups
        groups.size . should_equal 2
        groups.at "letters" . should_equal "abc a bc bcd"
        groups.at "empty" . should_equal Nothing
    Test.specify "should replace unmatched groups by a user-specified value" <|
        groups = match.named_groups "UNMATCHED"
        groups.size . should_equal 2
        groups.at "letters" . should_equal "abc a bc bcd"
        groups.at "empty" . should_equal "UNMATCHED"
# Checks `Match.start`, the start offset of a group looked up by index or name.
Test.group "Match.start" <|
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First
    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should return the start of a group by index" <|
        match.start 1 . should_equal 0
    Test.specify "should return the start of a group by name" <|
        # Group 1 ("aa ab ") covers the first 6 characters, so "letters" starts at 6.
        match.start "letters" . should_equal 6
    Test.specify "should return Nothing if the group didn't match" <|
        match.start 3 . should_equal Nothing
        match.start "empty" . should_equal Nothing
    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        match.start 5 . should_fail_with No_Such_Group
        match.start "nonexistent" . should_fail_with No_Such_Group
# Checks `Match.end`, the end offset of a group looked up by index or name.
Test.group "Match.end" <|
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First
    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should return the end of a group by index" <|
        match.end 1 . should_equal 6
    Test.specify "should return the end of a group by name" <|
        # "letters" runs to the end of the 18-character input.
        match.end "letters" . should_equal 18
    Test.specify "should return Nothing if the group didn't match" <|
        match.end 3 . should_equal Nothing
        match.end "empty" . should_equal Nothing
    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        match.end 5 . should_fail_with No_Such_Group
        match.end "nonexistent" . should_fail_with No_Such_Group
# Checks `Match.span`, which returns a group's location as a `Utf_16_Span`
# over the original input.
Test.group "Match.span" <|
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First
    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should get the span of a group by index" <|
        match.span 1 . should_equal (Utf_16_Span.Value (0.up_to 6) input)
    Test.specify "should get the span of a group by name" <|
        match.span "letters" . should_equal (Utf_16_Span.Value (6.up_to 18) input)
    Test.specify "should return Nothing if the group didn't match" <|
        match.span 3 . should_equal Nothing
        match.span "empty" . should_equal Nothing
    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        match.span 5 . should_fail_with No_Such_Group
        match.span "nonexistent" . should_fail_with No_Such_Group
# Checks `Match.start_position`, the start of the region the match was performed over.
Test.group "Match.start_position" <|
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First
    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should return the region start over which self match was performed" <|
        match.start_position . should_equal 0
# Checks `Match.end_position`, the end of the region the match was performed over.
Test.group "Match.end_position" <|
    engine = Default_Engine.new
    pattern = engine.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" []
    input = "aa ab abc a bc bcd"
    match = pattern.match input mode=Matching_Mode.First
    Test.specify "should be a Match" <|
        match . should_be_a Default_Engine.Match.Value
    Test.specify "should return the region end over which self match was performed" <|
        # 18 is the length of `input`.
        match.end_position . should_equal 18
# Entry point: runs the `spec` defined in this file as a standalone test suite.
main = Test_Suite.run_main spec

View File

@ -1,78 +0,0 @@
from Standard.Base import all
import Standard.Base.Data.Text.Matching
import Standard.Base.Errors.Common.No_Such_Method
from Standard.Test import Test, Test_Suite, Problems
import Standard.Test.Extensions
# A marker error type thrown by the tests in this file to check that errors are forwarded.
type Foo_Error
# Tests for the matching helpers: `match_single_criterion` (one name against one
# criterion) and `match_criteria` (a list of objects against a list of criteria),
# for both `Text_Matcher` and `Regex_Matcher`.
spec = Test.group 'Matching Helper' <|
    Test.specify 'should match a single name with a single Text_Matcher criterion' <|
        # Text matching is exact: no prefix matches and no regex interpretation of "f.*".
        Text_Matcher.Case_Sensitive.match_single_criterion "foo" "foo" . should_be_true
        Text_Matcher.Case_Sensitive.match_single_criterion "foobar" "foo" . should_be_false
        Text_Matcher.Case_Sensitive.match_single_criterion "foo" "f.*" . should_be_false
        Text_Matcher.Case_Sensitive.match_single_criterion "foo" "Foo" . should_be_false
    Test.specify 'should correctly handle Unicode folding with Text_Matcher matching' <|
        # Precomposed and decomposed forms of the same grapheme must compare equal.
        Text_Matcher.Case_Sensitive.match_single_criterion '\u00E9' '\u0065\u{301}' . should_be_true
        Text_Matcher.Case_Sensitive.match_single_criterion 'é' '\u00E9' . should_be_true
        Text_Matcher.Case_Sensitive.match_single_criterion 'é' 'ę' . should_be_false
    Test.specify 'should match a single name with a single regex criterion' <|
        # The regex must match the whole name: "foo" does not match "foobar".
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foo" "foo" . should_be_true
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foobar" "foo" . should_be_false
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foo" "f.*" . should_be_true
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foo" "foo.*" . should_be_true
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_single_criterion "foo" "F.*" . should_be_false
    Test.specify 'should support case-insensitive matching' <|
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive).match_single_criterion "foo" "F.*" . should_be_true
        Text_Matcher.Case_Insensitive.match_single_criterion "foO" "FOo" . should_be_true
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive).match_single_criterion "foo" "fF.*" . should_be_false
        Text_Matcher.Case_Insensitive.match_single_criterion "foo" "Foos" . should_be_false
        # Small beta is equal to capital 'beta' which looks the same as capital 'b' but is a different symbol.
        Text_Matcher.Case_Insensitive.match_single_criterion "β" "Β" . should_be_true
        Text_Matcher.Case_Insensitive.match_single_criterion "β" "B" . should_be_false
    Test.specify 'should match a list of names with a list of criteria, correctly handling reordering' <|
        # reorder=True follows the order of the criteria; reorder=False keeps the input order.
        Text_Matcher.Case_Sensitive.match_criteria ["foo", "bar", "baz"] ["baz", "foo"] reorder=True . should_equal ["baz", "foo"]
        Text_Matcher.Case_Sensitive.match_criteria ["foo", "bar", "baz"] ["baz", "foo"] reorder=False . should_equal ["foo", "baz"]
    Test.specify 'should allow multiple matches to a single criterion (Regex)' <|
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_criteria ["foo", "bar", "baz", "quux"] ["b.*"] reorder=True . should_equal ["bar", "baz"]
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_criteria ["foo", "bar", "baz", "quux"] ["b.*", "foo"] reorder=False . should_equal ["foo", "bar", "baz"]
    Test.specify 'should include the object only with the first criterion that matched it, avoiding duplication' <|
        # "baz" matches both criteria but must appear only once in the result.
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_criteria ["foo", "bar", "baz", "zap"] [".*z.*", "b.*"] reorder=True . should_equal ["baz", "zap", "bar"]
        (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive).match_criteria ["foo", "bar", "baz", "zap"] [".*z.*", "b.*"] reorder=False . should_equal ["bar", "baz", "zap"]
    Test.specify 'should correctly handle criteria which did not match anything' <|
        # Unmatched criteria are reported as a `No_Matches_Found` problem, while the
        # criteria that did match still produce a result.
        action = Text_Matcher.Case_Sensitive.match_criteria ["foo", "bar", "baz"] ["baz", "unknown_column"] reorder=True on_problems=_
        tester = _.should_equal ["baz"]
        problems = [Matching.No_Matches_Found.Error ["unknown_column"]]
        Problems.test_problem_handling action problems tester
        action_2 = Text_Matcher.Case_Sensitive.match_criteria ["foo", "bar", "baz"] ["baz", "unknown_column_1", "unknown_column_2"] reorder=False on_problems=_
        problems_2 = [Matching.No_Matches_Found.Error ["unknown_column_1", "unknown_column_2"]]
        Problems.test_problem_handling action_2 problems_2 tester
    Test.specify 'should correctly work with complex object using a function extracting their names' <|
        # `name_mapper` extracts the text to match from each object; the objects themselves are returned.
        pairs = [Pair.new "foo" 42, Pair.new "bar" 33, Pair.new "baz" 10, Pair.new "foo" 0, Pair.new 10 10]
        selected = [Pair.new "bar" 33, Pair.new "foo" 42, Pair.new "foo" 0]
        Text_Matcher.Case_Sensitive.match_criteria pairs ["bar", "foo"] reorder=True name_mapper=_.first . should_equal selected
        Text_Matcher.Case_Sensitive.match_criteria [1, 2, 3] ["2"] name_mapper=_.to_text . should_equal [2]
    Test.specify 'should correctly forward errors' <|
        # An error in any argument, in the matcher itself, or raised by `name_mapper` must surface in the result.
        Text_Matcher.Case_Sensitive.match_criteria (Error.throw Foo_Error) [] . should_fail_with Foo_Error
        Text_Matcher.Case_Sensitive.match_criteria [] (Error.throw Foo_Error) . should_fail_with Foo_Error
        (Error.throw Foo_Error).match_criteria [] [] . should_fail_with Foo_Error
        Text_Matcher.Case_Sensitive.match_criteria [1, 2, 3] ["2"] name_mapper=(x-> if x == 3 then Error.throw Foo_Error else x.to_text) . should_fail_with Foo_Error
        Test.expect_panic_with matcher=No_Such_Method <|
            Text_Matcher.Case_Sensitive.match_criteria ["a"] ["a"] name_mapper=_.nonexistent_function
# Entry point: runs the `spec` defined in this file as a standalone test suite.
main = Test_Suite.run_main spec

View File

@ -1,487 +0,0 @@
from Standard.Base import all
import Standard.Base.Data.Text.Span.Span
import Standard.Base.Data.Text.Span.Utf_16_Span
import Standard.Base.Data.Text.Regex.Match_2.Match_2
import Standard.Base.Data.Text.Regex.Pattern_2.Pattern_2
import Standard.Base.Data.Text.Regex.Replacer.Replacer
import Standard.Base.Data.Text.Regex_2
import Standard.Base.Data.Text.Regex_2.No_Such_Group
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
from Standard.Test import Test, Test_Suite
import Standard.Test.Extensions
polyglot java import org.enso.base.Replacer_Cache
spec =
Test.group "Compile" <|
    # A valid pattern (named group, case-insensitive flag) yields a `Pattern_2`.
    Test.specify "should be able to be compiled" <|
        Regex_2.compile "(?<dots>..)" case_insensitive=True . should_be_a Pattern_2
    # Unbalanced parentheses must surface as a `Regex_Syntax_Error`.
    Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
        compiled = Regex_2.compile "ab(c((((((("
        compiled . should_fail_with Regex_Syntax_Error
    # The empty pattern is rejected with `Illegal_Argument`.
    Test.specify "should disallow empty patterns in `compile`" <|
        compiled = Regex_2.compile ""
        compiled . should_fail_with Illegal_Argument
Test.group "Escape" <|
    # The expected result wraps the literal text in `\Q` ... `\E`.
    Test.specify "should escape an expression for use as a literal" <|
        escaped = Regex_2.escape "http://example.com"
        escaped . should_equal "\Qhttp://example.com\E"
Test.group "Pattern.matches" <|
    Test.specify "should return True when the pattern matches against the input" <|
        regex = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        regex.matches "aa ab abc a bc bcd" . should_be_true
    Test.specify "should return False when the pattern doesn't match against the input" <|
        regex = Regex_2.compile "aaz"
        regex.matches "aa ab abc a bc bcd" . should_be_false
    Test.specify "should check for full matches" <|
        regex = Regex_2.compile "f.o"
        regex.matches "foo" . should_be_true
        # Extra text after the matched prefix makes `matches` return False.
        regex.matches "foobar" . should_be_false
    # Compiling "" fails; the failure is expected to carry through `matches`.
    Test.specify "`matches` with an empty pattern should be an error" <|
        Regex_2.compile "" . matches "ABC" . should_fail_with Illegal_Argument
Test.group "Pattern.match" <|
    Test.specify "should be able to `match` the first instance of the pattern in the input" <|
        subject = "aa ab abc a bc bcd"
        found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
        found . should_be_a Match_2
        # Group 0 is expected to cover the whole subject for this pattern.
        found.text 0 . should_equal subject
    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        regex = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        regex.match "abc" . should_equal Nothing
    Test.specify "should be able to `match` the all instances of the pattern in the input" <|
        hits = Regex_2.compile "(..)" . match_all "abcdefghij"
        # Ten characters consumed two at a time give five matches.
        hits.length . should_equal 5
        hits.at 0 . text 0 . should_equal "ab"
        hits.at 1 . text 0 . should_equal "cd"
        hits.at 2 . text 0 . should_equal "ef"
        hits.at 3 . text 0 . should_equal "gh"
        hits.at 4 . text 0 . should_equal "ij"
    Test.specify "should return `[]` when an all match match fails" <|
        hits = Regex_2.compile "(aa)" . match_all "abcdefghij"
        hits . should_equal []
    Test.specify "`match` with an empty pattern should be an error" <|
        Regex_2.compile "" . match "ABC" . should_fail_with Illegal_Argument
    Test.specify "`match_all` with an empty pattern should be an error" <|
        Regex_2.compile "" . match_all "ABC" . should_fail_with Illegal_Argument
Test.group "Pattern_2.find and .find_all" <|
    Test.specify "should be able to `find` the first instance of the pattern in the input" <|
        first_hit = Regex_2.compile "(..)" . find "abcdefghij"
        # `find` yields the matched text itself rather than a match object.
        first_hit . should_be_a Text
        first_hit . should_equal "ab"
    Test.specify "should return `Nothing` if there are no matches in first mode" <|
        regex = Regex_2.compile "(aa)"
        regex.find "abcdefghij" . should_equal Nothing
    Test.specify "should be able to `find` the all instances of the pattern in the input" <|
        hits = Regex_2.compile "(..)" . find_all "abcdefghij"
        hits.length . should_equal 5
        hits.at 0 . should_equal "ab"
        hits.at 1 . should_equal "cd"
        hits.at 2 . should_equal "ef"
        hits.at 3 . should_equal "gh"
        hits.at 4 . should_equal "ij"
    Test.specify "should return `[]` when an all match match fails" <|
        regex = Regex_2.compile "(aa)"
        regex.find_all "abcdefghij" . should_equal []
    Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
        alternating = ["a", "1", "a", "1"]
        Regex_2.compile "(a+|1+)" . find_all "a1a1" . should_equal alternating
        Regex_2.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal alternating
        Regex_2.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"]
    Test.specify "`find` with an empty pattern should be an error" <|
        Regex_2.compile "" . find "ABC" . should_fail_with Illegal_Argument
    Test.specify "`find_all` with an empty pattern should be an error" <|
        Regex_2.compile "" . find_all "ABC" . should_fail_with Illegal_Argument
Test.group "Pattern_2.split" <|
    Test.specify "should be able to `split` on the first instance of the pattern" <|
        separator = Regex_2.compile "cd"
        # With `only_first=True` the second "cd" stays in the remainder.
        separator.split "abcdefcdghij" only_first=True . should_equal ["ab", "efcdghij"]
    Test.specify "should return the original text if there are no matches in first mode" <|
        separator = Regex_2.compile "aa"
        separator.split "abcdefghij" only_first=True . should_equal ["abcdefghij"]
    Test.specify "should return the original text if there are no matches in all mode" <|
        separator = Regex_2.compile "aa"
        separator.split "abcdefghij" . should_equal ["abcdefghij"]
    Test.specify "should be able to `split` on the all instances of the pattern in the input" <|
        separator = Regex_2.compile "a"
        separator.split "bacadaeaf" . should_equal ["b", "c", "d", "e", "f"]
        # Adjacent, leading and trailing separators produce empty pieces.
        separator.split "baab" . should_equal ["b", "", "b"]
        separator.split "aaa" . should_equal ["", "", "", ""]
        separator.split "" . should_equal [""]
        separator.split "a" . should_equal ["", ""]
        separator.split "abaca" . should_equal ["", "b", "c", ""]
    Test.specify "should split without normalization" <|
        separator = Regex_2.compile "s"
        # The plain `s` followed by the combining accent `\u{301}` is still
        # split on, leaving the accent at the start of the last piece.
        separator.split 'aśsśs\u{301}śb' . should_equal ['aś', 'ś', '\u{301}śb']
Test.group "Pattern_2.tokenize" <|
    # Without capturing groups each token is the whole match.
    Test.specify "can tokenize with simple regexes without capturing groups" <|
        Regex_2.compile "[a-z]+" . tokenize "1-800-regex-yes" . should_equal ["regex", "yes"]
        Regex_2.compile "[a-z]+" case_insensitive=True . tokenize "1-800-REGEX-YES" . should_equal ["REGEX", "YES"]
        Regex_2.compile "\d\d" . tokenize "12 hi345 67r890r" . should_equal ["12", "34", "67", "89"]
    # With capturing groups each token is built from the captured text only.
    Test.specify "can tokenize with regexes with capturing groups" <|
        Regex_2.compile "(\d\d)\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "[a-z]+(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
        Regex_2.compile "[a-z]+(\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["", "182", "20", ""]
    Test.specify "ignores non-capturing groups" <|
        Regex_2.compile "(?:(\d\d)\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "(\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "(?<foo>\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "(?:[a-z]+)(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
    # Only the outermost capturing groups contribute to a token.
    Test.specify "ignores nested groups" <|
        Regex_2.compile "(\d(\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "(?<foo>\d(?<bar>\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
        Regex_2.compile "[a-z]+((\d)\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
        Regex_2.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"]
    # Captures of one match are concatenated; the alternative that did not
    # take part contributes nothing to the token.
    Test.specify "non-participating groups are rendered as the empty string" <|
        Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_4_0" . should_equal ['340']
        Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_q_0" . should_equal ['3q0']
    Test.specify "handles unicode" <|
        Regex_2.compile "[áê]+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
        # `+` only applies to the accent `\u{301}`, not to the entire grapheme.
        Regex_2.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
        Regex_2.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}a\u{301}']
        Regex_2.compile "x([áê]+)y" . tokenize "xáy xêy" . should_equal ["á", "ê"]
    Test.specify "examples are correct" <|
        Regex_2.compile "..." . tokenize "ABCDEF" . should_equal ["ABC","DEF"]
        Regex_2.compile "(.).(.)" . tokenize "ABCDEF" . should_equal ["AC","DF"]
        Regex_2.compile "(\S+)(?:\s+|$)" . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' . should_equal ["Hello","Big","Wide","World","Goodbye!"]
Test.group "Pattern_2.replace" <|
    Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
        pattern = Regex_2.compile "abc"
        input = "aa ab abc a bc abc"
        replaced = pattern.replace input "REPLACED" only_first=True
        replaced . should_be_a Text
        # Only the first "abc" is substituted; the trailing one is kept.
        replaced . should_equal "aa ab REPLACED a bc abc"
    Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <|
        pattern = Regex_2.compile "xyz"
        input = "aa ab ac ad"
        replaced = pattern.replace input "REPLACED" only_first=True
        replaced . should_equal input
    Test.specify "should be able to replace the all instances of the pattern in the input" <|
        pattern = Regex_2.compile "aa"
        input = "aa ab aa ac ad aa aa ax"
        replaced = pattern.replace input "REPLACED"
        replaced . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"
    Test.specify "should return the input when an all replace fails" <|
        pattern = Regex_2.compile "aa"
        input = "abcdefghij"
        replaced = pattern.replace input "REPLACED"
        replaced . should_equal input
    Test.specify "should be able to replace the entire input only if it matches" <|
        pattern = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        input = "aa ab abc a bc bcd"
        replaced = pattern.replace input "REPLACED"
        replaced . should_equal "REPLACED"
    Test.specify "should not perform overlapping replacements in all mode" <|
        pattern = Regex_2.compile "(..)"
        input = "aa ab"
        replaced = pattern.replace input "REPLACED"
        # "aa" and " a" are replaced; the leftover "b" is too short to match.
        replaced . should_equal "REPLACEDREPLACEDb"
    # `$1`, `$<capture>`, `$0` and `$&` are all expected to expand to the
    # matched word for this single-group pattern.
    Test.specify "should handle capture groups in replacement" <|
        pattern = Regex_2.compile "(?<capture>[a-z]+)"
        pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$1]" only_first=True . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[$<capture>]" . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$<capture>]" only_first=True . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[$0]" . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$0]" only_first=True . should_equal "[foo] bar, baz"
        pattern.replace "foo bar, baz" "[$&]" . should_equal "[foo] [bar], [baz]"
        pattern.replace "foo bar, baz" "[$&]" only_first=True . should_equal "[foo] bar, baz"
    Test.specify "should handle unicode in capture group names" <|
        pattern = Regex_2.compile "(?<건반>[a-z]+)"
        pattern.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]"
    # The group must be opened through `Test` (the test framework), not the
    # `Text` type.
    # NOTE(review): the nesting of this group and of the final spec below is
    # inferred from line order - confirm against the file's real indentation.
    Test.group "should correctly evaluate documentation examples" <|
        Test.specify "example 1" <|
            pattern = Regex_2.compile 'aa'
            pattern.replace 'aaa' 'b' . should_equal 'ba'
        Test.specify "example 2" <|
            pattern = Regex_2.compile '[lo]'
            pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!'
        Test.specify "example 3" <|
            pattern = Regex_2.compile 'l'
            pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!'
        Test.specify "example 4" <|
            pattern = Regex_2.compile '"(.*?)"'
            pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz'
        Test.specify "example 5" <|
            pattern = Regex_2.compile "aa"
            input = "aa ab aa ac ad aa aa ax"
            replaced = pattern.replace input "xyz"
            replaced . should_equal "xyz ab xyz ac ad xyz xyz ax"
        Test.specify "example 6" <|
            pattern = Regex_2.compile "([a-z]+)"
            pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"
    Test.specify "`replace` with an empty pattern should be an error" <|
        pattern = Regex_2.compile ""
        pattern.replace "ABC" . should_fail_with Illegal_Argument
Test.group "Match.text" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2
    Test.specify "should return the full match with index 0" <|
        found.text 0 . should_equal "aa ab abc a bc bcd"
    Test.specify "should return the group contents if it matches by index" <|
        found.text 1 . should_equal "aa ab "
    Test.specify "should return the group contents if it matches by name" <|
        found.text "letters" . should_equal "abc a bc bcd"
    Test.specify "should return Nothing if the group did not match" <|
        found.text 3 . should_equal Nothing
    Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
        found.text "fail" . should_fail_with No_Such_Group
        # The pattern declares four groups, so index 5 is out of range.
        found.text 5 . should_fail_with No_Such_Group
    Test.specify "should make named groups accessible by index" <|
        by_name = found.text "letters"
        found.text 2 . should_equal by_name
Test.group "Match.groups" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2
    Test.specify "should return the results of all groups" <|
        all_groups = found.groups
        # Group 0 plus the four groups declared in the pattern.
        all_groups.length . should_equal 5
        all_groups . should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing]
    Test.specify "should replace unmatched groups by a user-specified value" <|
        all_groups = found.groups "UNMATCHED"
        all_groups.length . should_equal 5
        all_groups . should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"]
Test.group "Match.named_groups" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2.Value
    Test.specify "should return the results of all named groups" <|
        by_name = found.named_groups
        # Only the two named groups appear; keys are sorted before comparing.
        by_name.keys.sort . should_equal ["empty", "letters"]
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal Nothing
    Test.specify "should replace unmatched groups by a user-specified value" <|
        by_name = found.named_groups "UNMATCHED"
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal "UNMATCHED"
Test.group "Match.start" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2
    Test.specify "should return the start of a group by index" <|
        found.start 1 . should_equal 0
    Test.specify "should return the start of a group by name" <|
        found.start "letters" . should_equal 6
    Test.specify "should return Nothing if the group didn't match" <|
        found.start 3 . should_equal Nothing
        found.start "empty" . should_equal Nothing
    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.start 5 . should_fail_with No_Such_Group
        found.start "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.end" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2
    Test.specify "should return the end of a group by index" <|
        found.end 1 . should_equal 6
    Test.specify "should return the end of a group by name" <|
        found.end "letters" . should_equal 18
    Test.specify "should return Nothing if the group didn't match" <|
        found.end 3 . should_equal Nothing
        found.end "empty" . should_equal Nothing
    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.end 5 . should_fail_with No_Such_Group
        found.end "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.utf_16_start" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2
    Test.specify "should return the start of a group by index" <|
        found.utf_16_start 1 . should_equal 0
    Test.specify "should return the start of a group by name" <|
        found.utf_16_start "letters" . should_equal 6
    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_start 3 . should_equal Nothing
        found.utf_16_start "empty" . should_equal Nothing
    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_start 5 . should_fail_with No_Such_Group
        found.utf_16_start "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.utf_16_end" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2
    Test.specify "should return the end of a group by index" <|
        found.utf_16_end 1 . should_equal 6
    Test.specify "should return the end of a group by name" <|
        found.utf_16_end "letters" . should_equal 18
    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_end 3 . should_equal Nothing
        found.utf_16_end "empty" . should_equal Nothing
    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_end 5 . should_fail_with No_Such_Group
        found.utf_16_end "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.span" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2
    Test.specify "should get the span of a group by index" <|
        expected = Span.Value (0.up_to 6) subject
        found.span 1 . should_equal expected
    Test.specify "should get the span of a group by name" <|
        expected = Span.Value (6.up_to 18) subject
        found.span "letters" . should_equal expected
    Test.specify "should return Nothing if the group didn't match" <|
        found.span 3 . should_equal Nothing
        found.span "empty" . should_equal Nothing
    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        found.span 5 . should_fail_with No_Such_Group
        found.span "nonexistent" . should_fail_with No_Such_Group
Test.group "Match.utf_16_span" <|
    # Fixture shared by every spec in this group.
    subject = "aa ab abc a bc bcd"
    found = Regex_2.compile "(.. .. )(?<letters>.+)()??(?<empty>)??" . match subject
    found . should_be_a Match_2
    Test.specify "should get the UTF16 span of a group by index" <|
        expected = Utf_16_Span.Value (0.up_to 6) subject
        found.utf_16_span 1 . should_equal expected
    Test.specify "should get the UTF16 span of a group by name" <|
        expected = Utf_16_Span.Value (6.up_to 18) subject
        found.utf_16_span "letters" . should_equal expected
    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_span 3 . should_equal Nothing
        found.utf_16_span "empty" . should_equal Nothing
    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_span 5 . should_fail_with No_Such_Group
        found.utf_16_span "nonexistent" . should_fail_with No_Such_Group
Test.group "caching" <|
    Test.specify "Replacer cache drops old values" <|
        regex = Regex_2.compile '([a-c])'
        # Use one more distinct replacement string than `get_lru_size`, so the
        # earliest one is expected to be pushed out of the cache.
        0.up_to get_lru_size+1 . map index->
            replacement = "$1$1x" + index.to_text
            regex.replace "abcdef" replacement . should_not_equal Nothing
        # The first replacement is gone, the second is still present.
        replacer_cache_lookup "$1$1x0" . should_equal Nothing
        replacer_cache_lookup "$1$1x1" . should_not_equal Nothing
# Entry point of this spec file: hands `spec` to `Test_Suite.run_main`.
main = Test_Suite.run_main spec

View File

@ -1,30 +1,507 @@
from Standard.Base import all
import Standard.Base.Data.Text.Span.Span
import Standard.Base.Data.Text.Span.Utf_16_Span
import Standard.Base.Data.Text.Regex
import Standard.Base.Data.Text.Regex.Match.Match
import Standard.Base.Data.Text.Regex.No_Such_Group
import Standard.Base.Data.Text.Regex.Pattern.Pattern
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
import Standard.Base.Data.Text.Regex.Replacer.Replacer
import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
from Standard.Test import Test, Test_Suite
import Standard.Test.Extensions
polyglot java import org.enso.base.Replacer_Cache
spec =
Test.group "Regex options handling" <|
Test.specify "should work properly with flag options" <|
flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[]
flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
Test.specify "should properly override vector options" <|
flags = Regex.from_flags match_ascii=True case_insensitive=Nothing dot_matches_newline=True multiline=False comments=True extra_opts=[Regex_Option.Multiline, Regex_Option.Case_Insensitive]
flags . should_equal [Regex_Option.Ascii_Matching, Regex_Option.Case_Insensitive, Regex_Option.Dot_Matches_Newline, Regex_Option.Comments]
Test.group "Regexes" <|
Test.group "Compile" <|
Test.specify "should be able to be compiled" <|
pattern = Regex.compile "(?<dots>..)" case_insensitive=True
pattern . should_be_a Default_Engine.Pattern.Value
pattern.options . should_equal [Regex_Option.Case_Insensitive]
pattern . should_be_a Pattern
Test.specify "should be able to be escaped" <|
pattern = "http://example.com"
Regex.escape pattern . should_equal "\Qhttp://example.com\E"
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
Regex.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error
## TODO: Missing tests for No_Such_Group_Error
Test.specify "should disallow empty patterns in `compile`" <|
Regex.compile "" . should_fail_with Illegal_Argument
Test.group "Escape" <|
Test.specify "should escape an expression for use as a literal" <|
Regex.escape "[a-z\d]+" . should_equal '\\[a-z\\d\\]\\+'
Test.group "Pattern.matches" <|
Test.specify "should return True when the pattern matches against the input" <|
pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
pattern.matches input . should_be_true
Test.specify "should return False when the pattern doesn't match against the input" <|
pattern = Regex.compile "aaz"
input = "aa ab abc a bc bcd"
pattern.matches input . should_be_false
Test.specify "should check for full matches" <|
pattern = Regex.compile "f.o"
pattern.matches "foo" . should_be_true
pattern.matches "foobar" . should_be_false
Test.specify "`matches` with an empty pattern should be an error" <|
pattern = Regex.compile ""
pattern.matches "ABC" . should_fail_with Illegal_Argument
Test.specify "`matches` against a non-Text should fail with Illegal_Argument" <|
pattern = Regex.compile "abc"
pattern.matches 1 . should_fail_with Type_Error
Test.group "Pattern.match and .match_all" <|
Test.specify "should be able to `match` the first instance of the pattern in the input" <|
pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "aa ab abc a bc bcd"
match = pattern.match input
match . should_be_a Match
match.text 0 . should_equal input
Test.specify "should return `Nothing` if there are no matches in first mode" <|
pattern = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
input = "abc"
match = pattern.match input
match . should_equal Nothing
Test.specify "should be able to `match` the all instances of the pattern in the input" <|
pattern = Regex.compile "(..)"
input = "abcdefghij"
matches = pattern.match_all input
matches.length . should_equal 5
matches.at 0 . text 0 . should_equal "ab"
matches.at 1 . text 0 . should_equal "cd"
matches.at 2 . text 0 . should_equal "ef"
matches.at 3 . text 0 . should_equal "gh"
matches.at 4 . text 0 . should_equal "ij"
Test.specify "should return `[]` when an all match match fails" <|
pattern = Regex.compile "(aa)"
input = "abcdefghij"
match = pattern.match_all input
match . should_equal []
Test.specify "`match` with an empty pattern should be an error" <|
pattern = Regex.compile ""
pattern.match "ABC" . should_fail_with Illegal_Argument
Test.specify "`match_all` with an empty pattern should be an error" <|
pattern = Regex.compile ""
pattern.match_all "ABC" . should_fail_with Illegal_Argument
Test.specify "`match` against a non-Text should fail with Illegal_Argument" <|
pattern = Regex.compile "abc"
pattern.match 1 . should_fail_with Type_Error
Test.specify "`match_all` against a non-Text should fail with Illegal_Argument" <|
pattern = Regex.compile "abc"
pattern.match_all 1 . should_fail_with Type_Error
Test.group "Pattern.find and .find_all" <|
Test.specify "should be able to `find` the first instance of the pattern in the input" <|
pattern = Regex.compile "(..)"
input = "abcdefghij"
match = pattern.find input
match . should_be_a Text
match . should_equal "ab"
Test.specify "should return `Nothing` if there are no matches in first mode" <|
pattern = Regex.compile "(aa)"
input = "abcdefghij"
match = pattern.find input
match . should_equal Nothing
Test.specify "should be able to `find` the all instances of the pattern in the input" <|
pattern = Regex.compile "(..)"
input = "abcdefghij"
match = pattern.find_all input
match.length . should_equal 5
match.at 0 . should_equal "ab"
match.at 1 . should_equal "cd"
match.at 2 . should_equal "ef"
match.at 3 . should_equal "gh"
match.at 4 . should_equal "ij"
Test.specify "should return `[]` when an all match match fails" <|
pattern = Regex.compile "(aa)"
input = "abcdefghij"
match = pattern.find_all input
match . should_equal []
Test.specify "should correctly handle edge cases where one-letter matches happen at the end of the word" <|
Regex.compile "(a+|1+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
Regex.compile "([a]+|[1]+)" . find_all "a1a1" . should_equal ["a", "1", "a", "1"]
Regex.compile "([0-9]+|[^0-9]+)" . find_all "a1b2" . should_equal ["a", "1", "b", "2"]
Test.specify "`find` with an empty pattern should be an error" <|
pattern = Regex.compile ""
pattern.find "ABC" . should_fail_with Illegal_Argument
Test.specify "`find_all` with an empty pattern should be an error" <|
pattern = Regex.compile ""
pattern.find_all "ABC" . should_fail_with Illegal_Argument
Test.group "Pattern.split" <|
Test.specify "should be able to `split` on the first instance of the pattern" <|
pattern = Regex.compile "cd"
input = "abcdefcdghij"
texts = pattern.split input only_first=True
texts . should_equal ["ab", "efcdghij"]
Test.specify "should return the original text if there are no matches in first mode" <|
pattern = Regex.compile "aa"
input = "abcdefghij"
texts = pattern.split input only_first=True
texts . should_equal ["abcdefghij"]
Test.specify "should return the original text if there are no matches in all mode" <|
pattern = Regex.compile "aa"
input = "abcdefghij"
texts = pattern.split input
texts . should_equal ["abcdefghij"]
Test.specify "should be able to `split` on the all instances of the pattern in the input" <|
pattern = Regex.compile "a"
pattern.split "bacadaeaf" . should_equal ["b", "c", "d", "e", "f"]
pattern.split "baab" . should_equal ["b", "", "b"]
pattern.split "aaa" . should_equal ["", "", "", ""]
pattern.split "" . should_equal [""]
pattern.split "a" . should_equal ["", ""]
pattern.split "abaca" . should_equal ["", "b", "c", ""]
Test.specify "should split without normalization" <|
pattern = Regex.compile "s"
pattern.split 'aśsśs\u{301}śb' . should_equal ['aś', 'ś', '\u{301}śb']
Test.specify "`split` against a non-Text should fail with Illegal_Argument" <|
pattern = Regex.compile "abc"
pattern.split 1 . should_fail_with Type_Error
Test.group "Pattern.tokenize" <|
Test.specify "can tokenize with simple regexes without capturing groups"
Regex.compile "[a-z]+" . tokenize "1-800-regex-yes" . should_equal ["regex", "yes"]
Regex.compile "[a-z]+" case_insensitive=True . tokenize "1-800-REGEX-YES" . should_equal ["REGEX", "YES"]
Regex.compile "\d\d" . tokenize "12 hi345 67r890r" . should_equal ["12", "34", "67", "89"]
Test.specify "can tokenize with regexes with capturing groups"
Regex.compile "(\d\d)\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
Regex.compile "[a-z]+(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
Regex.compile "[a-z]+(\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["", "182", "20", ""]
Test.specify "ignores non-capturing groups"
Regex.compile "(?:(\d\d)\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
Regex.compile "(\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
Regex.compile "(?<foo>\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
Regex.compile "(?:[a-z]+)(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
Test.specify "ignores nested groups"
Regex.compile "(\d(\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
Regex.compile "(?<foo>\d(?<bar>\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
Regex.compile "[a-z]+((\d)\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
Regex.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"]
Test.specify "non-participating groups are rendered as the empty string"
Regex.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_4_0" . should_equal ['340']
Regex.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_q_0" . should_equal ['3q0']
Test.specify "handles unicode" <|
Regex.compile "[áê]+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
# `+` only applies to the accent `\u{301}`, not to the entire grapheme.
Regex.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
Regex.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}a\u{301}']
Regex.compile "x([áê]+)y" . tokenize "xáy xêy" . should_equal ["á", "ê"]
Test.specify "examples are correct" <|
Regex.compile "..." . tokenize "ABCDEF" . should_equal ["ABC","DEF"]
Regex.compile "(.).(.)" . tokenize "ABCDEF" . should_equal ["AC","DF"]
Regex.compile "(\S+)(?:\s+|$)" . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' . should_equal ["Hello","Big","Wide","World","Goodbye!"]
# Covers `replace` on a compiled pattern: first-match-only mode, replace-all
# mode, inputs with no match, and group references inside the replacement.
Test.group "Pattern.replace" <|
    Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
        subject = "aa ab abc a bc abc"
        regex = Regex.compile "abc"
        replaced = regex.replace subject "REPLACED" only_first=True
        replaced . should_be_a Text
        replaced . should_equal "aa ab REPLACED a bc abc"

    Test.specify "should return the string unchanged if there are no matches to replace in only_first mode" <|
        subject = "aa ab ac ad"
        regex = Regex.compile "xyz"
        replaced = regex.replace subject "REPLACED" only_first=True
        replaced . should_equal subject

    Test.specify "should be able to replace the all instances of the pattern in the input" <|
        subject = "aa ab aa ac ad aa aa ax"
        regex = Regex.compile "aa"
        replaced = regex.replace subject "REPLACED"
        replaced . should_equal "REPLACED ab REPLACED ac ad REPLACED REPLACED ax"

    Test.specify "should return the input when an all replace fails" <|
        subject = "abcdefghij"
        regex = Regex.compile "aa"
        replaced = regex.replace subject "REPLACED"
        replaced . should_equal subject

    Test.specify "should be able to replace the entire input only if it matches" <|
        subject = "aa ab abc a bc bcd"
        regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
        replaced = regex.replace subject "REPLACED"
        replaced . should_equal "REPLACED"

    Test.specify "should not perform overlapping replacements in all mode" <|
        subject = "aa ab"
        regex = Regex.compile "(..)"
        replaced = regex.replace subject "REPLACED"
        # Two consecutive two-character matches; the trailing "b" is left over.
        replaced . should_equal "REPLACEDREPLACEDb"

    Test.specify "should handle capture groups in replacement" <|
        regex = Regex.compile "(?<capture>[a-z]+)"
        subject = "foo bar, baz"
        every_word = "[foo] [bar], [baz]"
        first_word = "[foo] bar, baz"
        # Reference to the group by index.
        regex.replace subject "[$1]" . should_equal every_word
        regex.replace subject "[$1]" only_first=True . should_equal first_word
        # Reference to the group by name.
        regex.replace subject "[$<capture>]" . should_equal every_word
        regex.replace subject "[$<capture>]" only_first=True . should_equal first_word
        # Reference to the whole match, spelled `$0`.
        regex.replace subject "[$0]" . should_equal every_word
        regex.replace subject "[$0]" only_first=True . should_equal first_word
        # Reference to the whole match, spelled `$&`.
        regex.replace subject "[$&]" . should_equal every_word
        regex.replace subject "[$&]" only_first=True . should_equal first_word

    Test.specify "should handle unicode in capture group names" <|
        regex = Regex.compile "(?<건반>[a-z]+)"
        regex.replace "foo bar, baz" "[$<건반>]" . should_equal "[foo] [bar], [baz]"
# Runs the `replace` examples (per the group name, taken from the
# documentation) followed by two invalid-usage cases.
#
# The header previously read `Text.group`, which does not register a test
# group; every sibling group in this file uses `Test.group`.
Test.group "should correctly evaluate documentation examples" <|
    Test.specify "example 1" <|
        pattern = Regex.compile 'aa'
        # Matches do not overlap: only the first "aa" of "aaa" is replaced.
        pattern.replace 'aaa' 'b' . should_equal 'ba'

    Test.specify "example 2" <|
        pattern = Regex.compile '[lo]'
        pattern.replace 'Hello World!' '#' . should_equal 'He### W#r#d!'

    Test.specify "example 3" <|
        pattern = Regex.compile 'l'
        pattern.replace 'Hello World!' '#' only_first=True . should_equal 'He#lo World!'

    Test.specify "example 4" <|
        pattern = Regex.compile '"(.*?)"'
        pattern.replace '"abc" foo "bar" baz' '($1)' . should_equal '(abc) foo (bar) baz'

    Test.specify "example 5" <|
        pattern = Regex.compile "aa"
        input = "aa ab aa ac ad aa aa ax"
        match = pattern.replace input "xyz"
        match . should_equal "xyz ab xyz ac ad xyz xyz ax"

    Test.specify "example 6" <|
        pattern = Regex.compile "([a-z]+)"
        pattern.replace "foo bar, baz" "[$1]" . should_equal "[foo] [bar], [baz]"

    Test.specify "`replace` with an empty pattern should be an error" <|
        pattern = Regex.compile ""
        # NOTE(review): no replacement argument is passed here; presumably the
        # Illegal_Argument already arises from compiling "" - confirm.
        pattern.replace "ABC" . should_fail_with Illegal_Argument

    # The label now names the error that is actually asserted (Type_Error).
    Test.specify "`replace` against a non-Text should fail with Type_Error" <|
        pattern = Regex.compile "abc"
        pattern.replace 1 "abc" . should_fail_with Type_Error
# `Match.text`: contents of a group, looked up by index or by name.
Test.group "Match.text" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    found . should_be_a Match

    Test.specify "should return the full match with index 0" <|
        found.text 0 . should_equal "aa ab abc a bc bcd"

    Test.specify "should return the group contents if it matches by index" <|
        found.text 1 . should_equal "aa ab "

    Test.specify "should return the group contents if it matches by name" <|
        found.text "letters" . should_equal "abc a bc bcd"

    Test.specify "should return Nothing if the group did not match" <|
        found.text 3 . should_equal Nothing

    Test.specify "should fail with No_Such_Group_Error if the group did not exist" <|
        found.text "fail" . should_fail_with No_Such_Group
        found.text 5 . should_fail_with No_Such_Group

    Test.specify "should make named groups accessible by index" <|
        # "letters" is the second capture group, so index 2 must agree with it.
        found.text 2 . should_equal (found.text "letters")
# `Match.groups`: all groups as a vector, with an optional stand-in value
# for groups that did not participate in the match.
Test.group "Match.groups" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    found . should_be_a Match

    Test.specify "should return the results of all groups" <|
        all_groups = found.groups
        all_groups.length . should_equal 5
        all_groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", Nothing, Nothing]

    Test.specify "should replace unmatched groups by a user-specified value" <|
        all_groups = found.groups "UNMATCHED"
        all_groups.length . should_equal 5
        all_groups.should_equal ["aa ab abc a bc bcd", "aa ab ", "abc a bc bcd", "UNMATCHED", "UNMATCHED"]
# `Match.named_groups`: only the named groups, keyed by name, with an
# optional stand-in value for groups that did not participate in the match.
Test.group "Match.named_groups" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    # NOTE(review): sibling groups check against `Match`, this one against
    # `Match.Value`; kept as in the original.
    found . should_be_a Match.Value

    Test.specify "should return the results of all named groups" <|
        by_name = found.named_groups
        by_name.keys.sort . should_equal ["empty", "letters"]
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal Nothing

    Test.specify "should replace unmatched groups by a user-specified value" <|
        by_name = found.named_groups "UNMATCHED"
        by_name.size . should_equal 2
        by_name.at "letters" . should_equal "abc a bc bcd"
        by_name.at "empty" . should_equal "UNMATCHED"
# `Match.start`: start position of a group, looked up by index or by name.
Test.group "Match.start" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    found . should_be_a Match

    Test.specify "should return the start of a group by index" <|
        found.start 1 . should_equal 0

    Test.specify "should return the start of a group by name" <|
        found.start "letters" . should_equal 6

    Test.specify "should return Nothing if the group didn't match" <|
        found.start 3 . should_equal Nothing
        found.start "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.start 5 . should_fail_with No_Such_Group
        found.start "nonexistent" . should_fail_with No_Such_Group
# `Match.end`: end position of a group, looked up by index or by name.
Test.group "Match.end" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    found . should_be_a Match

    Test.specify "should return the end of a group by index" <|
        found.end 1 . should_equal 6

    Test.specify "should return the end of a group by name" <|
        found.end "letters" . should_equal 18

    Test.specify "should return Nothing if the group didn't match" <|
        found.end 3 . should_equal Nothing
        found.end "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.end 5 . should_fail_with No_Such_Group
        found.end "nonexistent" . should_fail_with No_Such_Group
# `Match.utf_16_start`: start position of a group via the UTF-16 accessor,
# looked up by index or by name.
Test.group "Match.utf_16_start" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    found . should_be_a Match

    Test.specify "should return the start of a group by index" <|
        found.utf_16_start 1 . should_equal 0

    Test.specify "should return the start of a group by name" <|
        found.utf_16_start "letters" . should_equal 6

    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_start 3 . should_equal Nothing
        found.utf_16_start "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_start 5 . should_fail_with No_Such_Group
        found.utf_16_start "nonexistent" . should_fail_with No_Such_Group
# `Match.utf_16_end`: end position of a group via the UTF-16 accessor,
# looked up by index or by name.
Test.group "Match.utf_16_end" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    found . should_be_a Match

    Test.specify "should return the end of a group by index" <|
        found.utf_16_end 1 . should_equal 6

    Test.specify "should return the end of a group by name" <|
        found.utf_16_end "letters" . should_equal 18

    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_end 3 . should_equal Nothing
        found.utf_16_end "empty" . should_equal Nothing

    Test.specify "should return No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_end 5 . should_fail_with No_Such_Group
        found.utf_16_end "nonexistent" . should_fail_with No_Such_Group
# `Match.span`: the span of a group over the input, looked up by index or
# by name.
Test.group "Match.span" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    found . should_be_a Match

    Test.specify "should get the span of a group by index" <|
        expected = Span.Value (0.up_to 6) subject
        found.span 1 . should_equal expected

    Test.specify "should get the span of a group by name" <|
        expected = Span.Value (6.up_to 18) subject
        found.span "letters" . should_equal expected

    Test.specify "should return Nothing if the group didn't match" <|
        found.span 3 . should_equal Nothing
        found.span "empty" . should_equal Nothing

    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        found.span 5 . should_fail_with No_Such_Group
        found.span "nonexistent" . should_fail_with No_Such_Group
# `Match.utf_16_span`: the UTF-16 span of a group over the input, looked up
# by index or by name.
Test.group "Match.utf_16_span" <|
    subject = "aa ab abc a bc bcd"
    regex = Regex.compile "(.. .. )(?<letters>.+)()??(?<empty>)??"
    found = regex.match subject
    found . should_be_a Match

    Test.specify "should get the UTF16 span of a group by index" <|
        expected = Utf_16_Span.Value (0.up_to 6) subject
        found.utf_16_span 1 . should_equal expected

    Test.specify "should get the UTF16 span of a group by name" <|
        expected = Utf_16_Span.Value (6.up_to 18) subject
        found.utf_16_span "letters" . should_equal expected

    Test.specify "should return Nothing if the group didn't match" <|
        found.utf_16_span 3 . should_equal Nothing
        found.utf_16_span "empty" . should_equal Nothing

    Test.specify "should fail with a No_Such_Group_Error if the group doesn't exist" <|
        found.utf_16_span 5 . should_fail_with No_Such_Group
        found.utf_16_span "nonexistent" . should_fail_with No_Such_Group
# Checks that old entries are dropped from the replacer cache.
# `get_lru_size` and `replacer_cache_lookup` are defined outside this chunk;
# presumably they expose the capacity and contents of an LRU cache keyed by
# replacement string - confirm against their definitions.
Test.group "caching" <|
Test.specify "Replacer cache drops old values" <|
pattern = Regex.compile('([a-c])')
# Add enough values to flush out the first values.
# Each iteration uses a distinct replacement string ("$1$1x0", "$1$1x1", ...),
# and one more replacement is performed than `get_lru_size` reports.
0.up_to get_lru_size+1 . map i->
result = pattern.replace "abcdef" ("$1$1x" + i.to_text)
result . should_not_equal Nothing
# The first replacement string must no longer be found; the second still must be.
replacer_cache_lookup "$1$1x0" . should_equal Nothing
replacer_cache_lookup "$1$1x1" . should_not_equal Nothing
# Entry point: hands `spec` to `Test_Suite.run_main` so this file can be run
# on its own.
main = Test_Suite.run_main spec

View File

@ -1,6 +1,6 @@
from Standard.Base import all
import Standard.Base.Data.Text.Regex_2.No_Such_Group
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
import Standard.Base.Data.Text.Regex.No_Such_Group
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
import Standard.Base.Data.Text.Span.Span
import Standard.Base.Data.Text.Span.Utf_16_Span
import Standard.Base.Errors.Common.Index_Out_Of_Bounds
@ -9,8 +9,6 @@ import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.IO
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
from Standard.Base.Data.Text.Text_Sub_Range.Text_Sub_Range import all
from Standard.Base.Data.Index_Sub_Range.Index_Sub_Range import all

View File

@ -223,7 +223,6 @@ spec = Test.group "Vectors" <|
["abab", "aaabaaaa"].filter (Filter_Condition.Like "_ba_") . should_equal ["abab"]
["abab", "aaabaaaa"].filter (Filter_Condition.Like "%ba__%") . should_equal ["aaabaaaa"]
["aaaa", "bbbbb", "[ab]aaaa"].filter (Filter_Condition.Like "[ab]%") . should_equal ["[ab]aaaa"]
["a\Qa\Eabb", "aaabb"].filter (Filter_Condition.Like "_\Qa\Ea%") . should_equal ["a\Qa\Eabb"]
["f.txt", "abc.*"].filter (Filter_Condition.Like "%.*") . should_equal ["abc.*"]
["f.txt", "abc.*"].filter (Filter_Condition.Not_Like "%.*") . should_equal ["f.txt"]

View File

@ -50,9 +50,7 @@ import project.Data.Regression_Spec
import project.Data.Text_Spec
import project.Data.Text.Text_Sub_Range_Spec
import project.Data.Text.Default_Regex_Engine_Spec
import project.Data.Text.Encoding_Spec
import project.Data.Text.Matching_Spec
import project.Data.Text.Regex_Spec
import project.Data.Text.Span_Spec
import project.Data.Text.Utils_Spec
@ -126,10 +124,8 @@ main = Test_Suite.run_main <|
Problems_Spec.spec
Range_Spec.spec
Ref_Spec.spec
Lazy_Spec.spec
Default_Regex_Engine_Spec.spec
Regex_Spec.spec
Matching_Spec.spec
Lazy_Spec.spec
Runtime_Spec.spec
Self_Type_Spec.spec
Span_Spec.spec