Combine Regex and Pattern (#7172)

Merge Pattern into Regex.
This commit is contained in:
GregoryTravis 2023-07-05 09:51:53 -04:00 committed by GitHub
parent 72a6c54dfc
commit 966f8b773a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 496 additions and 489 deletions

View File

@ -1,7 +1,7 @@
import project.Any.Any
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
from project.Data.Text.Extensions import all
import project.Data.Text.Regex
import project.Data.Text.Regex.Regex
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Error.Error

View File

@ -1,5 +1,5 @@
import project.Data.Locale.Locale
import project.Data.Text.Regex
import project.Data.Text.Regex.Regex
import project.Data.Text.Text
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument

View File

@ -9,7 +9,7 @@ import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Data.Text.Encoding.Encoding
import project.Data.Text.Location.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex
import project.Data.Text.Regex.Regex
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.Regex_Syntax_Error
import project.Data.Text.Span.Span

View File

@ -1,59 +1,411 @@
import project.Any.Any
import project.Errors.Common.Syntax_Error
import project.Data.Filter_Condition.Filter_Condition
import project.Data.Map.Map
import project.Data.Numbers.Integer
import project.Data.Range.Range
import project.Data.Text.Helpers
import project.Data.Text.Prim_Text_Helper
import project.Data.Text.Regex.Pattern.Pattern
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.No_Such_Group
import project.Data.Text.Regex.Internal.Match_Iterator.Match_Iterator
import project.Data.Text.Regex.Internal.Match_Iterator.Match_Iterator_Value
import project.Data.Text.Regex.Internal.Replacer.Replacer
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Common.Type_Error
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Math
import project.Meta
import project.Nothing.Nothing
import project.Panic.Panic
import project.Polyglot.Polyglot
from project.Data.Boolean import Boolean, True, False
from project.Errors.Common import Syntax_Error
from project.Data.Index_Sub_Range import sort_and_merge_ranges
from project.Data.Range.Extensions import all
from project.Data.Text.Extensions import all
polyglot java import org.enso.base.Replacer_Cache
polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.Regex_Utils
## Compile the provided `expression` into a regex pattern that can be used for
matching.
type Regex
## Compile the provided `expression` into a `Regex` that can be used for
matching.
Arguments
- expression: The text representing the regular expression that you want to
compile. Must be non-empty.
- case_insensitive: Enables or disables case-insensitive matching. Case
insensitive matching behaves as if it normalises the case of all input
text before matching on it.
Arguments
- expression: The text representing the regular expression that you want to
compile. Must be non-empty.
- case_insensitive: Enables or disables case-insensitive matching. Case
insensitive matching behaves as if it normalises the case of all input
text before matching on it.
If an empty regex is used, `compile` throws an Illegal_Argument error.
If an empty regex is used, `compile` throws an Illegal_Argument error.
compile : Text -> Boolean | Nothing -> Regex ! Regex_Syntax_Error | Illegal_Argument
compile expression case_insensitive=Nothing =
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
options_string = if case_insensitive == True then "usgi" else "usg"
? Why Compile?
While many regex engines are able to cache ad-hoc patterns, it is often
useful to be able to manually retain a pattern that you have computed. This
function exists so you can hold onto the resultant `Pattern` object,
instead of immediately proceeding to match using it.
compile : Text -> Boolean | Nothing -> Pattern ! Regex_Syntax_Error | Illegal_Argument
compile self expression case_insensitive=Nothing =
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
options_string = if case_insensitive == True then "usgi" else "usg"
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
Regex.Value internal_regex_object
Pattern.Value internal_regex_object
## PRIVATE
## Escape the special characters in `expression` such that the result is a
valid literal pattern for the original string.
internal_regex_object : RegexObject (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
Value (internal_regex_object : Any)
Arguments:
- expression: The expression to escape metacharacters in.
## Returns `True` if the input matches against the pattern described by
`self`, otherwise `False`.
> Example
Turn a Text into a regex that matches that string exactly.
Arguments:
- input: The text to check for matching.
matches : Text -> Boolean | Type_Error
matches self input =
    Helpers.expect_text input <|
        m = self.internal_regex_object.exec input 0
        # The offsets reported by the regex result are UTF-16 positions (the
        # match iterator in this library builds `Utf_16_Span`s from them), so
        # the end of the match has to be compared with the UTF-16 length of
        # the input. `input.length` counts characters rather than UTF-16
        # units, which makes a full match on text containing multi-unit
        # characters look like a partial one.
        m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == Text_Utils.char_length input
example_escape =
literal_string = "\!\.|abcde"
Regex.escape literal_string
escape : Text -> Text
escape self expression = Regex_Utils.regexQuote expression
## Attempts a single match of the pattern `self` against `input`.

   The result is the first `Match` (with its match groups), or `Nothing` when
   the pattern does not match anywhere.

   Arguments:
   - input: The text to match the pattern described by `self` against.
match : Text -> Match | Nothing | Type_Error
match self input =
    Helpers.expect_text input <|
        first_step = (Match_Iterator.new self input).next
        case first_step of
            Match_Iterator_Value.Last _ -> Nothing
            Match_Iterator_Value.Next _ found _ -> found
## Collects every match of the pattern `self` in `input`.

   Returns a `Vector Match`, each element containing the matched text and its
   match groups. An `Illegal_Argument` error is raised if the underlying
   pattern is the empty string.

   Arguments:
   - input: The text to match the pattern described by `self` against.
match_all : Text -> Vector Match ! Type_Error | Illegal_Argument
match_all self input =
    Helpers.expect_text input <|
        if self.internal_regex_object.pattern == '' then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
            found = Vector.new_builder
            # Walk the iterator to its end, keeping only the matches (the
            # filler between them is ignored).
            collect iterator = case iterator.next of
                Match_Iterator_Value.Last _ -> Nothing
                Match_Iterator_Value.Next _ match rest ->
                    found.append match
                    @Tail_Call collect rest
            collect (Match_Iterator.new self input)
            found.to_vector
## Finds the first occurrence of the pattern `self` in `input`.

   Returns the matched `Text`, or `Nothing` if the pattern does not match.

   Arguments:
   - input: The text to match the pattern described by `self` against.
find : Text -> Text | Nothing | Type_Error
find self input =
    Helpers.expect_text input <|
        first_match = self.match input
        match_to_group_maybe first_match
## Finds every occurrence of the pattern `self` in `input`.

   Returns a `Vector Text` holding the text of each match. If the pattern
   does not match, an empty `Vector` is returned.

   Arguments:
   - input: The text to match the pattern described by `self` against.
find_all : Text -> Vector Text ! Type_Error
find_all self input =
    Helpers.expect_text input <|
        all_matches = self.match_all input
        all_matches.map match_to_group_maybe
## Splits the `input` text at the places where the pattern `self` matches.

   Arguments:
   - input: The text to split based on the pattern described by `self`.
   - only_first: If true, only split at the first occurrence.

   This method will _always_ return a vector. If no splits take place, the
   vector will contain a single element (equal to the original string).

   > Example
     Split on the first instance of the pattern.

         pattern = Regex.compile "cd"
         input = "abcdefcdghij"
         texts = pattern.split input only_first=True
         texts . should_equal ["ab", "efcdghij"]

   > Example
     Split on all instances of the pattern in the input.

         pattern = Regex.compile "a"
         input = "bacadaeaf"
         texts = pattern.split input
         texts . should_equal ["b", "c", "d", "e", "f"]

   > Example
     Returns the original text if there are no matches.

         pattern = Regex.compile "aa"
         input = "abcdefghij"
         texts = pattern.split input
         texts . should_equal ["abcdefghij"]
split : Text -> Boolean -> Vector Text | Type_Error
split self input only_first=False =
    Helpers.expect_text input <|
        pieces = Vector.new_builder
        # Only the filler between matches is kept; the matches themselves
        # are the separators and are dropped.
        collect step = case step of
            Match_Iterator_Value.Last trailing ->
                pieces.append trailing.text
            Match_Iterator_Value.Next leading _ rest ->
                pieces.append leading.text
                following = if only_first then rest.early_exit else rest.next
                @Tail_Call collect following
        first_step = (Match_Iterator.new self input).next
        collect first_step
        pieces.to_vector
## Takes an input string and returns all the matches as a `Vector Text`.

   If the pattern contains marked groups, the values are concatenated
   together; otherwise the whole match is returned. Non-participating
   groups are omitted.

   Arguments:
   - input: The text to tokenize.

   > Example
     Split to blocks of 3 characters.

         Regex.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']

   > Example
     Split to blocks of 3 characters taking first and third letters.

         Regex.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']

   > Example
     Split a text on any white space.

         Regex.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
         == ['Hello','Big','Wide','World','Goodbye!']
tokenize : Text -> Vector Text
tokenize self input =
    all_matches = self.match_all input
    all_matches.map (match-> build_tokenization_output_from_match self match)
## Replace all occurrences of the pattern described by `self` in the `input`
   with the specified `replacement`.

   Arguments:
   - input: The text in which to perform the replacement(s).
   - replacement: The text with which to replace any matches. It may contain
     references to match groups (see below).
   - only_first: If True, only replace the first match.

   If this method performs no replacements it will return the `input` text
   unchanged.

   The replacement string can contain references to groups matched by the
   regex. The following syntaxes are supported:
       $0: the entire match string
       $&: the entire match string
       $n: the nth group
       $<foo>: Named group `foo`

   If building the `Replacer` from `replacement` fails, that error is
   returned instead of a text; this happens before any matching is done.

   > Example
     Replace letters in the text "aa".

         pattern = Regex.compile 'aa'
         pattern.replace 'aaa' 'b' == 'ba'

   > Example
     Replace all occurrences of letters 'l' and 'o' with '#'.

         pattern = Regex.compile '[lo]'
         pattern.replace 'Hello World!' '#' == 'He### W#r#d!'

   > Example
     Replace the first occurrence of letter 'l' with '#'.

         pattern = Regex.compile 'l'
         pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'

   > Example
     Replace texts in quotes with parentheses.

         pattern = Regex.compile '"(.*?)"'
         pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'

   > Example
     Replace a literal string with a replacement value.

         pattern = Regex.compile "aa"
         input = "aa ab aa ac ad aa aa ax"
         match = pattern.replace input "xyz"
         match == "xyz ab xyz ac ad xyz xyz ax"

   > Example
     Replace each word with the same word surrounded by `[]`.

         pattern = Regex.compile "([a-z]+)"
         pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
replace : Text -> Text -> Boolean -> Text | Type_Error
replace self input replacement only_first=False =
    Helpers.expect_text input <|
        # The previous version wrapped this in `case it of` with a
        # `Match_Iterator_Value.Last` branch, but `it` is a `Match_Iterator`
        # and can never be a `Match_Iterator_Value`, so that branch was
        # unreachable. It is removed here; the observable behaviour (the
        # replacer is always built first) is unchanged.
        replacer = Replacer.new replacement self
        replacer.if_not_error <|
            # Accumulate `filler + replacement` for every match, then append
            # the final filler.
            go step current = case step of
                Match_Iterator_Value.Next filler match next_it ->
                    new_value = current + filler.text + (replacer.replace match)
                    following = if only_first then next_it.early_exit else next_it.next
                    @Tail_Call go following new_value
                Match_Iterator_Value.Last filler ->
                    current + filler.text
            it = Match_Iterator.new self input
            go it.next ""
## PRIVATE

   Resolve a match group name or number to a group number, checking that it
   is valid.

   Arguments:
   - id: The name or number of the group that was asked for.

   Returns: a group number.

   A group number is invalid if it is outside the range of groups
   that were in the original pattern.

   A group name is invalid if it was not defined in the original pattern.

   A group name is an alias for a group number; if a name is passed to
   this method, it returns the corresponding group number.

   If a group number is passed to `lookup_group` and it is valid, it will
   simply return the group number.

   Note that it is possible for a group to "not participate in the match",
   for example with a disjunction. In the example below, the "(d)" group
   does not participate -- it neither matches nor fails.

       "ab((c)|(d))".find "abc"

   In this case, the group id for "(d)", which is 3, is a valid group id and
   (Regex.lookup_group 3) will return 3. If the caller tries to get group 3,
   Match.group will return Nothing.
lookup_group : Integer | Text -> Integer ! No_Such_Group
lookup_group self id =
    case id of
        number : Integer ->
            in_range = number >= 0 && number < self.internal_regex_object.groupCount
            if in_range then number else Error.throw (No_Such_Group.Error number)
        name : Text ->
            # The regex object exposes a mapping from group name to number.
            name_to_number = self.internal_regex_object.groups
            resolved = case name_to_number of
                # If Nothing, there are no named groups
                Nothing -> Error.throw (No_Such_Group.Error name)
                _ -> read_group_map name_to_number name
            case resolved of
                Nothing -> Error.throw (No_Such_Group.Error name)
                _ : Integer -> resolved
## PRIVATE

   Return a lazy iterator over matches against a string.

   Arguments
   - input: the string to match against.

   The iterator is created with `Match_Iterator.new`, using `self` as the
   pattern.
iterator : Text -> Match_Iterator
iterator self input = Match_Iterator.new self input
## Return the number of groups in the underlying RegexObject.

   Note, the count includes group 0 (the whole match) as well, so a pattern
   without any capturing groups has a count of 1.
group_count : Integer
group_count self = self.internal_regex_object.groupCount
## Return a vector of all named group names.
named_groups : Vector Text
named_groups self =
    # NOTE(review): `lookup_group` treats a `Nothing` value of `groups` as
    # "no named groups"; here the value is handed to `polyglot_map_to_map`
    # unchecked -- confirm that this is handled for patterns without named
    # groups.
    name_to_number = polyglot_map_to_map self.internal_regex_object.groups
    name_to_number.keys
## Return a map from group number to group name. Only includes named groups.
group_nums_to_names : Map Integer Text
group_nums_to_names self =
    name_to_number = polyglot_map_to_map self.internal_regex_object.groups
    # Swap each (name, number) entry around to get (number, name).
    name_to_number.transform name-> number-> [number, name]
## Escape the special characters in `expression` such that the result is a
   valid literal pattern for the original string.

   Arguments:
   - expression: The expression to escape metacharacters in.

   The escaping itself is delegated to `Regex_Utils.regexQuote`.

   > Example
     Turn a Text into a regex that matches that string exactly.

         example_escape =
             literal_string = "\!\.|abcde"
             Regex.escape literal_string
escape : Text -> Text
escape expression = Regex_Utils.regexQuote expression
## PRIVATE

   Convert the polyglot map to a Map, with one entry for every member that
   the polyglot object reports.
polyglot_map_to_map : Any -> Map Any Any
polyglot_map_to_map map =
    member_names = Vector.from_polyglot_array (Polyglot.get_members map)
    entries = member_names.map name-> [name, Polyglot.get_member map name]
    Map.from_vector entries
## PRIVATE

   Get the named group from the polyglot map: the map is converted with
   `polyglot_map_to_map` and then queried for `name`.
read_group_map : Any -> Text -> Integer | Nothing
read_group_map polyglot_map name =
    name_to_number = polyglot_map_to_map polyglot_map
    name_to_number.get name
## PRIVATE

   Turn an optional `Match` into its full matched text (group 0), passing
   `Nothing` through unchanged.
match_to_group_maybe : Match | Nothing -> Text | Nothing
match_to_group_maybe match =
    case match of
        Nothing -> Nothing
        _ -> match.text 0
## PRIVATE

   Build an output string from a Match resulting from `tokenize`.

   When the pattern has no capturing groups the whole match is used;
   otherwise the texts of the participating groups are concatenated, with
   nested groups merged into their enclosing group.

   See `tokenize`.
build_tokenization_output_from_match : Regex -> Match -> Text
build_tokenization_output_from_match pattern match =
    only_whole_match = pattern.group_count == 1
    if only_whole_match then match.text 0 else
        # Spans of every capturing group that took part in the match.
        capture_indices = 1.up_to pattern.group_count
        all_spans = capture_indices.map (n-> match.span n)
        spans = all_spans.filter Filter_Condition.Not_Nothing
        ranges = spans.map span-> case span of Span.Value range _ -> range

        # Sorting and merging the ranges eliminates nested capturing groups.
        merged_ranges = sort_and_merge_ranges ranges

        # Rebuild `Span`s over the original text from the merged ranges and
        # join their texts.
        parent_text = case spans.at 0 of Span.Value _ text -> text
        merged_spans = merged_ranges.map range-> Span.Value range parent_text
        merged_spans.map (.text) . join
## An error that is emitted when there is no such group in the match for the
provided `id`.

View File

@ -0,0 +1,84 @@
import project.Errors.Common.Syntax_Error
import project.Data.Numbers.Integer
import project.Data.Range.Range
import project.Data.Text.Prim_Text_Helper
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.Regex
import project.Data.Text.Regex.Regex_Syntax_Error
import project.Data.Text.Text
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
from project.Data.Range.Extensions import all
from project.Data.Text.Extensions import all
polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.Regex_Utils
## PRIVATE
Performs the regex match, and iterates through the results. Yields both
the matched parts of the string, and the 'filler' parts between them.
The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because
matches and replacement boundaries can fall in the middle of multi-
character graphemes, thereby splitting them apart.
At each step, it yields a Match_Iterator_Value, which has either a filler
and a match, or just the final filler. A Match_Iterator_Value.Last value is
returned at the end, and only at the end.
Optionally, you can call `early_exit` to have it return the remainder of
the string, unmatched, as a single Last value. (Used for `replace` with
`only_first=True`.)
type Match_Iterator
    ## PRIVATE

       Create an iterator over the matches of `pattern` in `input`, with the
       cursor at position 0.
    new : Regex -> Text -> Match_Iterator
    new pattern input = Match_Iterator.Value pattern input 0

    ## PRIVATE
    Value (pattern : Regex) (input : Text) (cursor : Integer)

    ## PRIVATE

       Return the next match, or the last filler string if there is no
       additional match.

       Also returns the next iterator, if there was a match.
    next : Match_Iterator_Value
    next self =
        input_size = self.input.char_vector.length
        # Once the cursor has moved past the end of the input there is nothing
        # left to search, so the regex is not executed at all.
        regex_result = if self.cursor > input_size then Nothing else self.pattern.internal_regex_object.exec self.input self.cursor
        case regex_result.is_nothing.not && regex_result.isMatch of
            True ->
                whole_match_start = regex_result.getStart 0
                leading_filler = Utf_16_Span.Value (Range.new self.cursor whole_match_start) self.input
                match = Match.Value self.pattern regex_result self.input
                # Handle edge case where match is 0 length: always advance
                # the cursor by at least one position.
                next_cursor = (self.cursor + 1).max (match.utf_16_end 0)
                rest = Match_Iterator.Value self.pattern self.input next_cursor
                Match_Iterator_Value.Next leading_filler match rest
            # No further match: everything from the cursor to the end of the
            # input is the final filler, which is exactly what `early_exit`
            # produces.
            False -> self.early_exit

    ## PRIVATE

       Returns the remainder of the string, unmatched.
    early_exit : Match_Iterator_Value
    early_exit self =
        remainder = Range.new self.cursor (Text_Utils.char_length self.input)
        Match_Iterator_Value.Last (Utf_16_Span.Value remainder self.input)
## PRIVATE

   A single step yielded by `Match_Iterator`.
type Match_Iterator_Value
    ## PRIVATE

       A match together with the filler that precedes it (from the
       iterator's cursor up to the start of the match), and the iterator to
       continue from.
    Next (filler : Utf_16_Span) (match : Match) (next_iterator : Match_Iterator)

    ## PRIVATE

       The final, unmatched filler (from the iterator's cursor to the end of
       the input). No further iterator is provided.
    Last (filler : Utf_16_Span)

View File

@ -1,10 +1,9 @@
import project.Data.Numbers.Integer
from project.Data.Text.Extensions import all
import project.Data.Text.Regex
import project.Data.Text.Regex.Regex
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.No_Such_Group
import project.Data.Text.Regex.Pattern.Match_Iterator_Value
import project.Data.Text.Regex.Pattern.Pattern
import project.Data.Text.Regex.Internal.Match_Iterator.Match_Iterator_Value
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
import project.Data.Vector.Vector
@ -23,7 +22,7 @@ type Replacer
Implements a replacement for a regular expression.
Pattern.replace uses a Replacer to replace each regex match with
Regex.replace uses a Replacer to replace each regex match with
a replacement string. This string can contain references to match
groups from the original regex.
@ -41,7 +40,7 @@ type Replacer
Arguments
- replacement_string: a string, possibly containing group references,
that will be used to provide a replacement in a regex match.
new : Text -> Pattern -> Replacer ! No_Such_Group
new : Text -> Regex -> Replacer ! No_Such_Group
new replacement_string pattern =
Replacer.Value (build_replacement_vector_cached replacement_string pattern)
@ -84,7 +83,7 @@ group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))"
Uses Replacer_Cache to avoid rebuilding the vector for recently used
replacement strings.
build_replacement_vector_cached : Text -> Pattern -> Vector Replacement ! No_Such_Group
build_replacement_vector_cached : Text -> Regex -> Vector Replacement ! No_Such_Group
build_replacement_vector_cached replacement_string pattern =
Replacer_Cache.get_or_set replacement_string _->
build_replacement_vector replacement_string pattern
@ -95,7 +94,7 @@ build_replacement_vector_cached replacement_string pattern =
Parse the replacement string into an alternating series of literal
strings and group reference numbers.
build_replacement_vector : Text -> Pattern -> Vector Replacement ! No_Such_Group
build_replacement_vector : Text -> Regex -> Vector Replacement ! No_Such_Group
build_replacement_vector replacement_string pattern =
replacement_pattern = Regex.compile group_reference_regex
it = replacement_pattern.iterator replacement_string
@ -119,14 +118,14 @@ build_replacement_vector replacement_string pattern =
Parse a capture group reference.
Arguments:
- pattern: the Pattern used to initiate the replacement. This is used
- pattern: the Regex used to initiate the replacement. This is used
to identify and validate capture groups.
- match: the match of the replacement string against group_reference_regex.
Returns a Replacement: a group number, or, in the case of `$$`, a literal.
See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
parse_group_number : Pattern -> Match -> Replacement ! No_Such_Group
parse_group_number : Regex -> Match -> Replacement ! No_Such_Group
parse_group_number pattern match = case match.text.take 2 of
"$$" -> Replacement.Literal "$"
"$<" ->

View File

@ -4,7 +4,7 @@ import project.Data.Numbers.Integer
from project.Data.Range.Extensions import all
import project.Data.Range.Range
import project.Data.Text.Regex.No_Such_Group
import project.Data.Text.Regex.Pattern.Pattern
import project.Data.Text.Regex.Regex
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
@ -21,7 +21,7 @@ type Match
## PRIVATE
internal_regex_result : RegexResult (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
Value (pattern : Pattern) (internal_regex_result : Any) (input : Text)
Value (pattern : Regex) (internal_regex_result : Any) (input : Text)
## PRIVATE
Returns the start UTF16 character index of a group.
@ -121,7 +121,7 @@ type Match
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. If the caller tries to get group 3,
(Regex.lookup_group 3) will return 3. If the caller tries to get group 3,
Match.utf_16_span will return the default value.
utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
utf_16_span self group=0 ~default=Nothing =
@ -159,7 +159,7 @@ type Match
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. If the caller tries to get
(Regex.lookup_group 3) will return 3. If the caller tries to get
group 3, Match.span will return the default value.
span : Integer | Text -> Any -> Span ! No_Such_Group
span self group=0 ~default=Nothing =
@ -187,7 +187,7 @@ type Match
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. If the caller tries to get
(Regex.lookup_group 3) will return 3. If the caller tries to get
group 3, Match.text will return the default value.
text : Integer | Text -> Any -> Text ! No_Such_Group
text self group=0 ~default=Nothing =
@ -217,7 +217,7 @@ type Match
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. `groups` will return the
(Regex.lookup_group 3) will return 3. `groups` will return the
default value for groups that do not participate.
> Example
@ -249,7 +249,7 @@ type Match
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern.lookup_group 3) will return 3. `named_groups` will map
(Regex.lookup_group 3) will return 3. `named_groups` will map
a named group that does not participate to the default value.
> Example

View File

@ -1,427 +0,0 @@
import project.Any.Any
import project.Data.Filter_Condition.Filter_Condition
import project.Data.Map.Map
import project.Data.Numbers.Integer
from project.Data.Range.Extensions import all
import project.Data.Range.Range
from project.Data.Text.Extensions import all
import project.Data.Text.Helpers
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.No_Such_Group
import project.Data.Text.Regex.Replacer.Replacer
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Errors.Common.Type_Error
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Math
import project.Meta
import project.Nothing.Nothing
import project.Polyglot.Polyglot
from project.Data.Boolean import Boolean, True, False
from project.Data.Index_Sub_Range import sort_and_merge_ranges
polyglot java import org.enso.base.Replacer_Cache
polyglot java import org.enso.base.Text_Utils
type Pattern
## internal_regex_object : RegexObject (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
Value (internal_regex_object : Any)
## Returns `True` if the input matches against the pattern described by
`self`, otherwise `False`.
Arguments:
- input: The text to check for matching.
matches : Text -> Boolean | Type_Error
matches self input =
Helpers.expect_text input <|
m = self.internal_regex_object.exec input 0
m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length
## Tries to match the provided `input` against the pattern `self`.
Returns a `Match` containing the matched text and its match groups, or
`Nothing` if the match failed.
Arguments:
- input: The text to match the pattern described by `self` against.
match : Text -> Match | Nothing | Type_Error
match self input =
Helpers.expect_text input <|
it = Match_Iterator.new self input
case it.next of
Match_Iterator_Value.Next _ match _ -> match
Match_Iterator_Value.Last _ -> Nothing
## Tries to match the provided `input` against the pattern `self`.
Returns a `Vector Match` object, each containing the matched text
and its match groups.
Arguments:
- input: The text to match the pattern described by `self` against.
match_all : Text -> Vector Match ! Type_Error | Illegal_Argument
match_all self input =
Helpers.expect_text input <|
pattern_is_empty = self.internal_regex_object.pattern == ''
if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
builder = Vector.new_builder
it = Match_Iterator.new self input
go it = case it.next of
Match_Iterator_Value.Next _ match next_it ->
builder.append match
@Tail_Call go next_it
Match_Iterator_Value.Last _ -> Nothing
go it
builder.to_vector
## Tries to match the provided `input` against the pattern `self`.
Returns a `Text` containing the matched text, or `Nothing` if the match
failed.
Arguments:
- input: The text to match the pattern described by `self` against.
find : Text -> Text | Nothing | Type_Error
find self input =
Helpers.expect_text input <|
match_to_group_maybe <| self.match input
## Tries to match the provided `input` against the pattern `self`.
Returns a `Vector Text`, each containing the matched text.
If the pattern does not match, an empty `Vector` is returned.
Arguments:
- input: The text to match the pattern described by `self` against.
find_all : Text -> Vector Text ! Type_Error
find_all self input =
Helpers.expect_text input <|
self.match_all input . map match_to_group_maybe
## Splits the `input` text based on the pattern described by `self`.
Arguments:
- input: The text to split based on the pattern described by `self`.
- only_first: If true, only split at the first occurrence.
This method will _always_ return a vector. If no splits take place, the
vector will contain a single element (equal to the original string).
> Example
Split on the first instance of the pattern.
pattern = Regex.compile "cd"
input = "abcdefcdghij"
texts = pattern.split input only_first=True
texts . should_equal ["ab", "efcdghij"]
> Example
Split on the all instances of the pattern in the input.
pattern = Regex.compile "a"
input = "bacadaeaf"
texts = pattern.split input
texts . should_equal ["b", "c", "d", "e", "f"]
> Example
Returns the original text if there are no matches.
pattern = Regex.compile "aa"
input = "abcdefghij"
texts = pattern.split input
texts . should_equal ["abcdefghij"]
split : Text -> Boolean -> Vector Text | Type_Error
split self input only_first=False =
Helpers.expect_text input <|
builder = Vector.new_builder
it = Match_Iterator.new self input
go next = case next of
Match_Iterator_Value.Next filler _ next_it ->
builder.append filler.text
next = if only_first then next_it.early_exit else next_it.next
@Tail_Call go next
Match_Iterator_Value.Last filler ->
builder.append filler.text
go it.next
builder.to_vector
## Takes an input string and returns all the matches as a `Vector Text`.
If the pattern contains marked groups, the values are concatenated
together; otherwise the whole match is returned. Non-participating
groups are omitted.
Arguments:
- input: The text to tokenize.
> Example
Split to blocks of 3 characters.
Regex.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']
> Example
Split to blocks of 3 characters taking first and third letters.
Regex.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']
> Example
Split a text on any white space.
Regex.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
== ['Hello','Big','Wide','World','Goodbye!']
tokenize : Text -> Vector Text
tokenize self input =
self.match_all input . map (build_tokenization_output_from_match self _)
## Replace all occurrences of the pattern described by `self` in the `input`
   with the specified `replacement`.

   Arguments:
   - input: The text in which to perform the replacement(s).
   - replacement: The text with which to replace any matches. It may contain
     the group references listed below.
   - only_first: If True, only replace the first match.

   If this method performs no replacements it will return the `input` text
   unchanged.

   The replacement string can contain references to groups matched by the
   regex. The following syntaxes are supported:
       $0: the entire match string
       $&: the entire match string
       $n: the nth group
       $<foo>: Named group `foo`

   > Example
     Replace letters in the text "aa".

         pattern = Regex.compile 'aa'
         pattern.replace 'aaa' 'b' == 'ba'

   > Example
     Replace all occurrences of letters 'l' and 'o' with '#'.

         pattern = Regex.compile '[lo]'
         pattern.replace 'Hello World!' '#' == 'He### W#r#d!'

   > Example
     Replace the first occurrence of letter 'l' with '#'.

         pattern = Regex.compile 'l'
         pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'

   > Example
     Replace texts in quotes with parentheses.

         pattern = Regex.compile '"(.*?)"'
         pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'

   > Example
     Replace a literal string with a replacement value.

         pattern = Regex.compile "aa"
         input = "aa ab aa ac ad aa aa ax"
         match = pattern.replace input "xyz"
         match == "xyz ab xyz ac ad xyz xyz ax"

   > Example
     Replace each word with the same word surrounded by `[]`.

         pattern = Regex.compile "([a-z]+)"
         pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
replace : Text -> Text -> Boolean -> Text | Type_Error
replace self input replacement only_first=False =
    Helpers.expect_text input <|
        it = Match_Iterator.new self input
        # The replacer is built before any matching is attempted, so a
        # problem with `replacement` surfaces even when nothing matches.
        replacer = Replacer.new replacement self
        replacer.if_not_error <|
            # Accumulates the output: each step appends the unmatched filler
            # preceding a match and then the replacement for that match. The
            # `Last` step appends the trailing filler; when there is no match
            # at all that filler spans the whole input.
            go step result = case step of
                Match_Iterator_Value.Next filler match next_it ->
                    extended = result + filler.text + (replacer.replace match)
                    # With `only_first`, stop matching and take the rest of
                    # the input verbatim.
                    following = if only_first then next_it.early_exit else next_it.next
                    @Tail_Call go following extended
                Match_Iterator_Value.Last filler ->
                    result + filler.text
            go it.next ""
## PRIVATE

   Resolve a match group identifier (a name or a number) to a group number,
   checking that it is valid.

   Arguments:
   - id: The name or number of the group that was asked for.

   Returns: a group number.

   A group number is invalid if it is outside the range of groups
   that were in the original pattern.

   A group name is invalid if it was not defined in the original pattern.

   A group name is an alias for a group number; if a name is passed to
   this method, it returns the corresponding group number.

   If a group number is passed to `lookup_group` and it is valid, it will
   simply return the group number.

   Note that it is possible for a group to "not participate in the match",
   for example with a disjunction. In the example below, the "(d)" group
   does not participate -- it neither matches nor fails.

       "ab((c)|(d))".find "abc"

   In this case, the group id for "(d)", which is 3, is a valid group id and
   (Pattern.lookup_group 3) will return 3. If the caller tries to get group 3,
   Match.group will return Nothing.
lookup_group : Integer | Text -> Integer ! No_Such_Group
lookup_group self id =
    case id of
        n : Integer ->
            in_range = n >= 0 && n < self.internal_regex_object.groupCount
            if in_range then n else Error.throw (No_Such_Group.Error n)
        name : Text ->
            # Mapping from group name to group number.
            name_to_number = self.internal_regex_object.groups
            resolved = case name_to_number of
                # If Nothing, there are no named groups at all.
                Nothing -> Error.throw (No_Such_Group.Error name)
                _ -> read_group_map name_to_number name
            # A name missing from the mapping resolves to Nothing.
            case resolved of
                _ : Integer -> resolved
                Nothing -> Error.throw (No_Such_Group.Error name)
## PRIVATE

   Create a `Match_Iterator` over the matches of this pattern in a text.

   Arguments:
   - input: the text to match against.
iterator : Text -> Match_Iterator
iterator self input =
    Match_Iterator.new self input
## The number of groups reported by the underlying RegexObject.

   Group 0 (the whole match) is included in the count.
group_count : Integer
group_count self =
    self.internal_regex_object.groupCount
## The names of all named groups, as a vector.
named_groups : Vector Text
named_groups self =
    # The regex object exposes its named groups as a polyglot map keyed by
    # group name.
    (polyglot_map_to_map self.internal_regex_object.groups).keys
## A map from group number to group name. Only named groups are included.
group_nums_to_names : Map Integer Text
group_nums_to_names self =
    names_to_nums = polyglot_map_to_map self.internal_regex_object.groups
    # Swap each (name, number) entry into (number, name).
    names_to_nums.transform name-> num-> [num, name]
## PRIVATE

   Performs the regex match, and iterates through the results. Yields both
   the matched parts of the string, and the 'filler' parts between them.

   The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because
   matches and replacement boundaries can fall in the middle of multi-
   character graphemes, thereby splitting them apart.

   At each step, it yields a Match_Iterator_Value, which has either a filler
   and a match, or just the final filler. A Match_Iterator_Value.Last value is
   returned at the end, and only at the end.

   Optionally, you can call `early_exit` to have it return the remainder of
   the string, unmatched, as a single Last value. (Used for `replace` with
   `only_first=True`.)
type Match_Iterator
    ## PRIVATE
       Create an iterator positioned at the start of `input`.
    new : Pattern -> Text -> Match_Iterator
    new pattern input = Match_Iterator.Value pattern input 0

    ## PRIVATE
       Arguments:
       - pattern: the pattern being matched.
       - input: the text being searched.
       - cursor: the index in `input` from which the next search starts.
    Value (pattern : Pattern) (input : Text) (cursor : Integer)

    ## PRIVATE
       Return the next match, or the last filler string if there is no
       additional match.

       Also returns the next iterator, if there was a match.
    next : Match_Iterator_Value
    next self =
        # The cursor is compared against the same length that bounds the
        # final filler span. A cursor equal to that length is still searched,
        # so that an empty match at the very end of the input can be found.
        input_length = Text_Utils.char_length self.input
        regex_result = if self.cursor > input_length then Nothing else self.pattern.internal_regex_object.exec self.input self.cursor
        case regex_result.is_nothing.not && regex_result.isMatch of
            # No further match: the rest of the input is the final filler.
            False -> self.early_exit
            True ->
                match_start = regex_result.getStart 0
                filler_range = Range.new self.cursor match_start
                filler_span = (Utf_16_Span.Value filler_range self.input)
                match = Match.Value self.pattern regex_result self.input
                # Handle edge case where match is 0 length: always advance by
                # at least one position so the iteration makes progress.
                next_cursor = (self.cursor + 1).max (match.utf_16_end 0)
                next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
                Match_Iterator_Value.Next filler_span match next_iterator

    ## PRIVATE
       Returns the remainder of the string, unmatched.
    early_exit : Match_Iterator_Value
    early_exit self =
        filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
        filler_span = Utf_16_Span.Value filler_range self.input
        Match_Iterator_Value.Last filler_span
## PRIVATE
   A single step produced by a `Match_Iterator`.
type Match_Iterator_Value
    ## PRIVATE
       A match was found.

       Arguments:
       - filler: the unmatched text between the iterator's cursor and the
         start of the match.
       - match: the match itself.
       - next_iterator: the iterator from which to continue after this match.
    Next (filler : Utf_16_Span) (match : Match) (next_iterator : Match_Iterator)

    ## PRIVATE
       No further match; iteration is finished.

       Arguments:
       - filler: the remaining text from the iterator's cursor to the end of
         the input.
    Last (filler : Utf_16_Span)
## PRIVATE

   Copy the members of a polyglot map into a `Map`, keyed by member name.
polyglot_map_to_map : Any -> Map Any Any
polyglot_map_to_map map =
    member_names = Vector.from_polyglot_array (Polyglot.get_members map)
    entries = member_names.map name-> [name, Polyglot.get_member map name]
    Map.from_vector entries
## PRIVATE

   Look up a named group in the polyglot map of named groups.
read_group_map : Any -> Text -> Integer | Nothing
read_group_map polyglot_map name =
    (polyglot_map_to_map polyglot_map).get name
## PRIVATE

   The text of the whole match (group 0), or `Nothing` if there is no match.
match_to_group_maybe : Match | Nothing -> Text | Nothing
match_to_group_maybe match = case match of
    Nothing -> Nothing
    _ -> match.text 0
## PRIVATE

   Build an output string from a Match resulting from `tokenize`.

   See `tokenize`.
build_tokenization_output_from_match : Pattern -> Match -> Text
build_tokenization_output_from_match pattern match =
    # A group count of 1 means only group 0 (the whole match) exists, so the
    # whole match is the token. Otherwise the token is assembled from the
    # capturing groups.
    case pattern.group_count == 1 of
        True -> match.text 0
        False ->
            # Spans of all capturing groups that have one, and their ranges.
            capture_spans = (1.up_to pattern.group_count).map (n-> match.span n) . filter Filter_Condition.Not_Nothing
            capture_ranges = capture_spans.map span-> case span of Span.Value range _ -> range

            # Eliminate nested capturing groups by sorting and merging the ranges.
            merged_ranges = sort_and_merge_ranges capture_ranges

            # Reconstruct `Spans` from the synthesized `Ranges`, and concatenate.
            source_text = case capture_spans.at 0 of Span.Value _ text -> text
            merged_spans = merged_ranges.map range-> Span.Value range source_text
            merged_spans.map (.text) . join

View File

@ -100,7 +100,7 @@ import project.Data.Text.Encoding.Encoding
import project.Data.Text.Line_Ending_Style.Line_Ending_Style
import project.Data.Text.Location.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex
import project.Data.Text.Regex.Regex
import project.Data.Text.Text_Ordering.Text_Ordering
import project.Data.Text.Text_Sub_Range.Text_Sub_Range
import project.Data.Time.Date.Date
@ -152,7 +152,7 @@ export project.Data.Text.Encoding.Encoding
export project.Data.Text.Line_Ending_Style.Line_Ending_Style
export project.Data.Text.Location.Location
export project.Data.Text.Matching_Mode.Matching_Mode
export project.Data.Text.Regex
export project.Data.Text.Regex.Regex
export project.Data.Text.Text_Ordering.Text_Ordering
export project.Data.Text.Text_Sub_Range.Text_Sub_Range
export project.Data.Time.Date.Date

View File

@ -5,7 +5,6 @@ import project.Data.Table.Table
import project.Data.Type.Value_Type.Value_Type
import project.Internal.Problem_Builder.Problem_Builder
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
import Standard.Base.Data.Text.Regex.Pattern.Pattern
from project.Errors import Column_Count_Exceeded, Column_Count_Mismatch, Duplicate_Output_Column_Names, Invalid_Value_Type, Missing_Input_Columns
from project.Internal.Java_Exports import make_string_builder
@ -72,7 +71,7 @@ parse_to_columns table input_column_id pattern="." case_sensitivity=Case_Sensiti
Create a parser from a regex to a nested `Vector`. Each match becomes an
element of the vector; each group (or the whole match, if there are no
groups) becomes an element of the inner vectors.
regex_parse_to_vectors : Pattern -> (Text -> Vector (Vector (Text | Nothing)))
regex_parse_to_vectors : Regex -> (Text -> Vector (Vector (Text | Nothing)))
regex_parse_to_vectors pattern =
input->
matches = pattern.match_all input
@ -85,7 +84,7 @@ regex_parse_to_vectors pattern =
If the regex has no explicit groups, it uses the original column name
unchanged; otherwise, it uses the group name if it exists, or the original
column name with a number.
regex_to_column_names : Pattern -> Text -> Vector Text
regex_to_column_names : Regex -> Text -> Vector Text
regex_to_column_names pattern original_column_name =
case pattern.group_count of
1 ->

View File

@ -3,13 +3,13 @@ import Standard.Base.Data.Text.Span.Span
import Standard.Base.Data.Text.Span.Utf_16_Span
import Standard.Base.Data.Text.Regex.Match.Match
import Standard.Base.Data.Text.Regex.No_Such_Group
import Standard.Base.Data.Text.Regex.Pattern.Pattern
import Standard.Base.Data.Text.Regex.Regex
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
import Standard.Base.Data.Text.Regex.Replacer.Replacer
import Standard.Base.Data.Text.Regex.Internal.Replacer.Replacer
import Standard.Base.Errors.Common.Type_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
from Standard.Base.Data.Text.Regex.Internal.Replacer import get_lru_size, replacer_cache_lookup
from Standard.Test import Test, Test_Suite
import Standard.Test.Extensions
@ -20,7 +20,7 @@ spec =
Test.group "Compile" <|
Test.specify "should be able to be compiled" <|
pattern = Regex.compile "(?<dots>..)" case_insensitive=True
pattern . should_be_a Pattern
pattern . should_be_a Regex
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
Regex.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error