mirror of
https://github.com/enso-org/enso.git
synced 2024-12-23 19:21:54 +03:00
parent
72a6c54dfc
commit
966f8b773a
@ -1,7 +1,7 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
from project.Data.Text.Extensions import all
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Regex
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Error.Error
|
||||
|
@ -1,5 +1,5 @@
|
||||
import project.Data.Locale.Locale
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Regex
|
||||
import project.Data.Text.Text
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
|
@ -9,7 +9,7 @@ import project.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import project.Data.Text.Encoding.Encoding
|
||||
import project.Data.Text.Location.Location
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Regex
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.Regex_Syntax_Error
|
||||
import project.Data.Text.Span.Span
|
||||
|
@ -1,59 +1,411 @@
|
||||
import project.Any.Any
|
||||
import project.Errors.Common.Syntax_Error
|
||||
import project.Data.Filter_Condition.Filter_Condition
|
||||
import project.Data.Map.Map
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Helpers
|
||||
import project.Data.Text.Prim_Text_Helper
|
||||
import project.Data.Text.Regex.Pattern.Pattern
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.No_Such_Group
|
||||
import project.Data.Text.Regex.Internal.Match_Iterator.Match_Iterator
|
||||
import project.Data.Text.Regex.Internal.Match_Iterator.Match_Iterator_Value
|
||||
import project.Data.Text.Regex.Internal.Replacer.Replacer
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Common.Type_Error
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Math
|
||||
import project.Meta
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
import project.Polyglot.Polyglot
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Errors.Common import Syntax_Error
|
||||
from project.Data.Index_Sub_Range import sort_and_merge_ranges
|
||||
from project.Data.Range.Extensions import all
|
||||
from project.Data.Text.Extensions import all
|
||||
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
polyglot java import org.enso.base.Regex_Utils
|
||||
|
||||
## Compile the provided `expression` into a regex pattern that can be used for
|
||||
matching.
|
||||
type Regex
|
||||
## Compile the provided `expression` into a `Regex` that can be used for
|
||||
matching.
|
||||
|
||||
Arguments
|
||||
- expression: The text representing the regular expression that you want to
|
||||
compile. Must be non-empty.
|
||||
- case_insensitive: Enables or disables case-insensitive matching. Case
|
||||
insensitive matching behaves as if it normalises the case of all input
|
||||
text before matching on it.
|
||||
Arguments
|
||||
- expression: The text representing the regular expression that you want to
|
||||
compile. Must be non-empty.
|
||||
- case_insensitive: Enables or disables case-insensitive matching. Case
|
||||
insensitive matching behaves as if it normalises the case of all input
|
||||
text before matching on it.
|
||||
|
||||
If an empty regex is used, `compile` throws an Illegal_Argument error.
|
||||
If an empty regex is used, `compile` throws an Illegal_Argument error.
|
||||
compile : Text -> Boolean | Nothing -> Regex ! Regex_Syntax_Error | Illegal_Argument
|
||||
compile expression case_insensitive=Nothing =
|
||||
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
|
||||
options_string = if case_insensitive == True then "usgi" else "usg"
|
||||
|
||||
? Why Compile?
|
||||
While many regex engines are able to cache ad-hoc patterns, it is often
|
||||
useful to be able to manually retain a pattern that you have computed. This
|
||||
function exists so you can hold onto the resultant `Pattern` object,
|
||||
instead of immediately proceeding to match using it.
|
||||
compile : Text -> Boolean | Nothing -> Pattern ! Regex_Syntax_Error | Illegal_Argument
|
||||
compile self expression case_insensitive=Nothing =
|
||||
if expression == '' then Error.throw (Illegal_Argument.Error "Regex cannot be the empty string") else
|
||||
options_string = if case_insensitive == True then "usgi" else "usg"
|
||||
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
|
||||
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
|
||||
|
||||
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
|
||||
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
|
||||
Regex.Value internal_regex_object
|
||||
|
||||
Pattern.Value internal_regex_object
|
||||
## PRIVATE
|
||||
|
||||
## Escape the special characters in `expression` such that the result is a
|
||||
valid literal pattern for the original string.
|
||||
internal_regex_object : RegexObject (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (internal_regex_object : Any)
|
||||
|
||||
Arguments:
|
||||
- expression: The expression to escape metacharacters in.
|
||||
## Returns `True` if the input matches against the pattern described by
|
||||
`self`, otherwise `False`.
|
||||
|
||||
> Example
|
||||
Turn a Text into a regex that matches that string exactly.
|
||||
Arguments:
|
||||
- input: The text to check for matching.
|
||||
matches : Text -> Boolean | Type_Error
|
||||
matches self input =
|
||||
Helpers.expect_text input <|
|
||||
m = self.internal_regex_object.exec input 0
|
||||
m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length
|
||||
|
||||
example_escape =
|
||||
literal_string = "\!\.|abcde"
|
||||
Regex.escape literal_string
|
||||
escape : Text -> Text
|
||||
escape self expression = Regex_Utils.regexQuote expression
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Match` containing the matched text and its match groups, or
|
||||
`Nothing` if the match failed.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match : Text -> Match | Nothing | Type_Error
match self input =
    Helpers.expect_text input <|
        # Only the first step of the iteration is needed here.
        first_step = (Match_Iterator.new self input).next
        case first_step of
            Match_Iterator_Value.Last _ -> Nothing
            Match_Iterator_Value.Next _ first_match _ -> first_match
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Vector Match` object, each containing the matched text
|
||||
and its match groups.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match_all : Text -> Vector Match ! Type_Error | Illegal_Argument
match_all self input =
    Helpers.expect_text input <|
        case self.internal_regex_object.pattern == '' of
            True -> Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern")
            False ->
                found = Vector.new_builder
                # Walk the iterator until it reports the final filler,
                # collecting every match on the way.
                collect iterator = case iterator.next of
                    Match_Iterator_Value.Last _ -> Nothing
                    Match_Iterator_Value.Next _ match rest ->
                        found.append match
                        @Tail_Call collect rest
                collect (Match_Iterator.new self input)
                found.to_vector
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Text` containing the matched text, or `Nothing` if the match
|
||||
failed.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
find : Text -> Text | Nothing | Type_Error
find self input =
    Helpers.expect_text input <|
        # Reduce the first match (if any) to the text of its group 0.
        first_match = self.match input
        match_to_group_maybe first_match
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Vector Text`, each containing the matched text.
|
||||
If the pattern does not match, an empty `Vector` is returned.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
find_all : Text -> Vector Text ! Type_Error
find_all self input =
    Helpers.expect_text input <|
        # Reduce every match to the text of its group 0.
        all_matches = self.match_all input
        all_matches.map match_to_group_maybe
|
||||
|
||||
## Splits the `input` text based on the pattern described by `self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to split based on the pattern described by `self`.
|
||||
- only_first: If true, only split at the first occurrence.
|
||||
|
||||
This method will _always_ return a vector. If no splits take place, the
|
||||
vector will contain a single element (equal to the original string).
|
||||
|
||||
> Example
|
||||
Split on the first instance of the pattern.
|
||||
pattern = Regex.compile "cd"
|
||||
input = "abcdefcdghij"
|
||||
texts = pattern.split input only_first=True
|
||||
texts . should_equal ["ab", "efcdghij"]
|
||||
|
||||
> Example
|
||||
Split on all instances of the pattern in the input.
|
||||
pattern = Regex.compile "a"
|
||||
input = "bacadaeaf"
|
||||
texts = pattern.split input
|
||||
texts . should_equal ["b", "c", "d", "e", "f"]
|
||||
|
||||
> Example
|
||||
Returns the original text if there are no matches.
|
||||
pattern = Regex.compile "aa"
|
||||
input = "abcdefghij"
|
||||
texts = pattern.split input
|
||||
texts . should_equal ["abcdefghij"]
|
||||
split : Text -> Boolean -> Vector Text | Type_Error
split self input only_first=False =
    Helpers.expect_text input <|
        pieces = Vector.new_builder
        # Each step contributes the filler text that precedes a match; the
        # final step contributes whatever follows the last match.
        walk step = case step of
            Match_Iterator_Value.Last tail ->
                pieces.append tail.text
            Match_Iterator_Value.Next gap _ rest ->
                pieces.append gap.text
                following = case only_first of
                    True -> rest.early_exit
                    False -> rest.next
                @Tail_Call walk following
        walk (Match_Iterator.new self input).next
        pieces.to_vector
|
||||
|
||||
## Takes an input string and returns all the matches as a `Vector Text`.
|
||||
If the pattern contains marked groups, the values are concatenated
|
||||
together; otherwise the whole match is returned. Non-participating
|
||||
groups are omitted.
|
||||
|
||||
Arguments:
|
||||
- input: The text to tokenize.
|
||||
|
||||
> Example
|
||||
Split to blocks of 3 characters.
|
||||
|
||||
Regex.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']
|
||||
|
||||
> Example
|
||||
Split to blocks of 3 characters taking first and third letters.
|
||||
|
||||
Regex.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']
|
||||
|
||||
> Example
|
||||
Split a text on any white space.
|
||||
|
||||
Regex.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
|
||||
== ['Hello','Big','Wide','World','Goodbye!']
|
||||
tokenize : Text -> Vector Text
tokenize self input =
    # Turn each match into its token text (see
    # `build_tokenization_output_from_match`).
    matches = self.match_all input
    matches.map match-> build_tokenization_output_from_match self match
|
||||
|
||||
## Replace all occurrences of the pattern described by `self` in the `input`
|
||||
with the specified `replacement`.
|
||||
|
||||
Arguments:
|
||||
- input: The text in which to perform the replacement(s).
|
||||
- replacement: The literal text with which to replace any matches.
|
||||
- only_first: If True, only replace the first match.
|
||||
|
||||
If this method performs no replacements it will return the `input` text
|
||||
unchanged.
|
||||
|
||||
The replacement string can contain references to groups matched by the
|
||||
regex. The following syntaxes are supported:
|
||||
$0: the entire match string
|
||||
$&: the entire match string
|
||||
$n: the nth group
|
||||
$<foo>: Named group `foo`
|
||||
|
||||
> Example
|
||||
Replace letters in the text "aa".
|
||||
|
||||
pattern = Regex.compile 'aa'
|
||||
pattern.replace 'aaa' 'b' == 'ba'
|
||||
|
||||
> Example
|
||||
Replace all occurrences of letters 'l' and 'o' with '#'.
|
||||
|
||||
pattern = Regex.compile '[lo]'
|
||||
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
|
||||
|
||||
> Example
|
||||
Replace the first occurrence of letter 'l' with '#'.
|
||||
|
||||
pattern = Regex.compile 'l'
|
||||
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
|
||||
|
||||
> Example
|
||||
Replace texts in quotes with parentheses.
|
||||
|
||||
pattern = Regex.compile '"(.*?)"'
|
||||
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
|
||||
|
||||
> Example
|
||||
Replace a literal string with a replacement value.
|
||||
|
||||
pattern = Regex.compile "aa"
|
||||
input = "aa ab aa ac ad aa aa ax"
|
||||
match = pattern.replace input "xyz"
|
||||
match == "xyz ab xyz ac ad xyz xyz ax"
|
||||
|
||||
> Example
|
||||
Replace each word with the same word surrounded by `[]`.
|
||||
|
||||
pattern = Regex.compile "([a-z]+)"
|
||||
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
|
||||
replace : Text -> Text -> Boolean -> Text | Type_Error
replace self input replacement only_first=False =
    Helpers.expect_text input <|
        # The replacement template is parsed before iterating; if that fails
        # the error is returned instead of running the replacement loop.
        replacer = Replacer.new replacement self
        replacer.if_not_error <|
            # `current` accumulates the output built so far. Each step
            # appends the filler before a match followed by the substituted
            # text; the final step appends the trailing filler.
            go next current = case next of
                Match_Iterator_Value.Next filler match next_it ->
                    new_value = current + filler.text + (replacer.replace match)
                    following = if only_first then next_it.early_exit else next_it.next
                    @Tail_Call go following new_value
                Match_Iterator_Value.Last filler ->
                    current + filler.text
            # `Match_Iterator.new` always yields a `Match_Iterator`, never a
            # `Match_Iterator_Value`, so the iteration starts at its first step.
            go (Match_Iterator.new self input).next ""
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Look up a match group name or number, and check that it is valid.
|
||||
|
||||
Arguments:
|
||||
- id: The name or number of the group that was asked for.
|
||||
|
||||
Returns: a group number.
|
||||
|
||||
A group number is invalid if it is outside the range of groups
|
||||
that were in the original pattern.
|
||||
|
||||
A group name is invalid if it was not defined in the original pattern.
|
||||
|
||||
A group name is an alias for a group number; if a name is passed to
|
||||
this method, it returns the corresponding group number.
|
||||
|
||||
If a group number is passed to `lookup_group` and it is valid, it will
|
||||
simply return the group number.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Regex.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match.group will return Nothing.
|
||||
|
||||
lookup_group : Integer | Text -> Integer ! No_Such_Group
lookup_group self id = case id of
    index : Integer ->
        # A numeric id is valid when it lies within the pattern's groups.
        in_range = index >= 0 && index < self.internal_regex_object.groupCount
        if in_range then index else Error.throw (No_Such_Group.Error index)
    name : Text ->
        # Maps name to number
        name_to_number = self.internal_regex_object.groups
        resolved = case name_to_number of
            # If Nothing, there are no named groups
            Nothing -> Error.throw (No_Such_Group.Error name)
            _ -> read_group_map name_to_number name
        case resolved of
            Nothing -> Error.throw (No_Such_Group.Error name)
            _ : Integer -> resolved
|
||||
|
||||
## PRIVATE

   Return a lazy iterator over the matches of this pattern in a text.

   Arguments:
   - input: the text to match against.
iterator : Text -> Match_Iterator
iterator self input =
    Match_Iterator.new self input
|
||||
|
||||
## Return the number of groups in the underlying RegexObject.

   The count includes group 0 (the whole match).
group_count : Integer
group_count self =
    self.internal_regex_object.groupCount
|
||||
|
||||
## Return a vector of the names of all named groups.
named_groups : Vector Text
named_groups self =
    polyglot_map_to_map self.internal_regex_object.groups . keys
|
||||
|
||||
## Return a map from group number to group name.

   Only named groups are included.
group_nums_to_names : Map Integer Text
group_nums_to_names self =
    # The underlying map goes from name to number; swap each pair.
    names_to_nums = polyglot_map_to_map self.internal_regex_object.groups
    names_to_nums.transform name-> num-> [num, name]
|
||||
|
||||
## Escape the special characters in `expression` such that the result is a
|
||||
valid literal pattern for the original string.
|
||||
|
||||
Arguments:
|
||||
- expression: The expression to escape metacharacters in.
|
||||
|
||||
> Example
|
||||
Turn a Text into a regex that matches that string exactly.
|
||||
|
||||
example_escape =
|
||||
literal_string = "\!\.|abcde"
|
||||
Regex.escape literal_string
|
||||
escape : Text -> Text
escape expression =
    # Quoting is delegated to the Java helper.
    Regex_Utils.regexQuote expression
|
||||
|
||||
## PRIVATE
   Copy the members of a polyglot map object into a `Map`, keyed by member
   name.
polyglot_map_to_map : Any -> Map Any Any
polyglot_map_to_map map =
    member_names = Vector.from_polyglot_array (Polyglot.get_members map)
    Map.from_vector <| member_names.map name-> [name, Polyglot.get_member map name]
|
||||
|
||||
## PRIVATE
   Look up the entry for a named group in the polyglot group map.
read_group_map : Any -> Text -> Integer | Nothing
read_group_map polyglot_map name =
    polyglot_map_to_map polyglot_map . get name
|
||||
|
||||
## PRIVATE
   Return the text of group 0 of `match`, or `Nothing` when there is no match.
match_to_group_maybe : Match | Nothing -> Text | Nothing
match_to_group_maybe match = case match of
    Nothing -> Nothing
    _ -> match.text 0
|
||||
|
||||
## PRIVATE
   Produce the `tokenize` output text for a single `Match`.

   When the pattern has no capturing groups the whole match is returned;
   otherwise the texts of the participating groups are concatenated.
   See `tokenize`.
build_tokenization_output_from_match : Regex -> Match -> Text
build_tokenization_output_from_match pattern match =
    case pattern.group_count == 1 of
        # A count of 1 means only group 0 (the whole match) exists.
        True -> match.text 0
        False ->
            # Extract the ranges of the spans of all capturing groups,
            # dropping the groups that yielded no span.
            all_group_spans = 1.up_to pattern.group_count . map (n-> match.span n)
            participating = all_group_spans.filter Filter_Condition.Not_Nothing
            group_ranges = participating.map span-> case span of
                Span.Value range _ -> range

            # Eliminate nested capturing groups by sorting and merging the ranges.
            merged_ranges = sort_and_merge_ranges group_ranges

            # Rebuild a `Span` per merged range over the text carried by the
            # first participating span, and concatenate their texts.
            source_text = case participating.at 0 of
                Span.Value _ text -> text
            pieces = merged_ranges.map range-> (Span.Value range source_text).text
            pieces.join
|
||||
|
||||
## An error that is emitted when there is no such group in the match for the
|
||||
provided `id`.
|
||||
|
@ -0,0 +1,84 @@
|
||||
import project.Errors.Common.Syntax_Error
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Prim_Text_Helper
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.Regex
|
||||
import project.Data.Text.Regex.Regex_Syntax_Error
|
||||
import project.Data.Text.Text
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Data.Range.Extensions import all
|
||||
from project.Data.Text.Extensions import all
|
||||
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
polyglot java import org.enso.base.Regex_Utils
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Performs the regex match, and iterates through the results. Yields both
|
||||
the matched parts of the string, and the 'filler' parts between them.
|
||||
|
||||
The 'filler' elements are `Utf_16_Span`s, not `Spans`. This is because
|
||||
matches and replacement boundaries can fall in the middle of multi-
|
||||
character graphemes, thereby splitting them apart.
|
||||
|
||||
At each step, it yields a Match_Iterator_Value, which has either a filler
|
||||
and a match, or just the final filler. A Match_Iterator_Value.Last value is
|
||||
returned at the end, and only at the end.
|
||||
|
||||
Optionally, you can call `early_exit` to have it return the remainder of
|
||||
the string, unmatched, as a single Last value. (Used for `split` and `replace` with
|
||||
`only_first=True`.)
|
||||
type Match_Iterator
    ## PRIVATE
       Create an iterator positioned at the start of `input`.

       Arguments:
       - pattern: the `Regex` to match with.
       - input: the text to search.
    new : Regex -> Text -> Match_Iterator
    new pattern input = Match_Iterator.Value pattern input 0

    ## PRIVATE

       Arguments:
       - pattern: the `Regex` being matched.
       - input: the text being searched.
       - cursor: the index in `input` at which the next search starts. It is
         compared against `Text_Utils.char_length` and used as the start of
         `Utf_16_Span` ranges.
    Value (pattern : Regex) (input : Text) (cursor : Integer)

    ## PRIVATE
       Return the next match, or the last filler string if there is no
       additional match.

       Also returns the next iterator, if there was a match.
    next : Match_Iterator_Value
    next self =
        # Use the same length helper as the filler ranges, instead of
        # building the whole `char_vector` only to read its length.
        input_end = Text_Utils.char_length self.input
        regex_result = if self.cursor > input_end then Nothing else self.pattern.internal_regex_object.exec self.input self.cursor
        case regex_result.is_nothing.not && regex_result.isMatch of
            # No further match: the rest of the input is the final filler.
            False -> self.early_exit
            True ->
                match_start = regex_result.getStart 0
                filler_range = Range.new self.cursor match_start
                filler_span = Utf_16_Span.Value filler_range self.input
                match = Match.Value self.pattern regex_result self.input
                # Handle edge case where match is 0 length: always advance
                # the cursor by at least one.
                next_cursor = (self.cursor + 1).max (match.utf_16_end 0)
                next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
                Match_Iterator_Value.Next filler_span match next_iterator

    ## PRIVATE
       Returns the remainder of the string, unmatched.
    early_exit : Match_Iterator_Value
    early_exit self =
        filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
        filler_span = Utf_16_Span.Value filler_range self.input
        Match_Iterator_Value.Last filler_span

## PRIVATE
   A single step produced by `Match_Iterator`.
type Match_Iterator_Value
    ## PRIVATE
       A match, the unmatched filler that precedes it, and the iterator from
       which to continue.
    Next (filler : Utf_16_Span) (match : Match) (next_iterator : Match_Iterator)

    ## PRIVATE
       The final unmatched filler; produced when there is no further match.
    Last (filler : Utf_16_Span)
|
||||
|
@ -1,10 +1,9 @@
|
||||
import project.Data.Numbers.Integer
|
||||
from project.Data.Text.Extensions import all
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Regex
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.No_Such_Group
|
||||
import project.Data.Text.Regex.Pattern.Match_Iterator_Value
|
||||
import project.Data.Text.Regex.Pattern.Pattern
|
||||
import project.Data.Text.Regex.Internal.Match_Iterator.Match_Iterator_Value
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
@ -23,7 +22,7 @@ type Replacer
|
||||
|
||||
Implements a replacement for a regular expression.
|
||||
|
||||
Pattern.replace uses a Replacer to replace each regex match with
|
||||
Regex.replace uses a Replacer to replace each regex match with
|
||||
a replacement string. This string can contain references to match
|
||||
groups from the original regex.
|
||||
|
||||
@ -41,7 +40,7 @@ type Replacer
|
||||
Arguments
|
||||
- replacement_string: a string, possibly containing group references,
|
||||
that will be used to provide a replacement in a regex match.
|
||||
new : Text -> Pattern -> Replacer ! No_Such_Group
|
||||
new : Text -> Regex -> Replacer ! No_Such_Group
|
||||
new replacement_string pattern =
|
||||
Replacer.Value (build_replacement_vector_cached replacement_string pattern)
|
||||
|
||||
@ -84,7 +83,7 @@ group_reference_regex = "\$(([0-9]+)|(\$)|(&)|(<([^>]+)>))"
|
||||
|
||||
Uses Replacer_Cache to avoid rebuilding the vector for recently used
|
||||
replacement strings.
|
||||
build_replacement_vector_cached : Text -> Pattern -> Vector Replacement ! No_Such_Group
|
||||
build_replacement_vector_cached : Text -> Regex -> Vector Replacement ! No_Such_Group
|
||||
build_replacement_vector_cached replacement_string pattern =
|
||||
Replacer_Cache.get_or_set replacement_string _->
|
||||
build_replacement_vector replacement_string pattern
|
||||
@ -95,7 +94,7 @@ build_replacement_vector_cached replacement_string pattern =
|
||||
|
||||
Parse the replacement string into an alternating series of literal
|
||||
strings and group reference numbers.
|
||||
build_replacement_vector : Text -> Pattern -> Vector Replacement ! No_Such_Group
|
||||
build_replacement_vector : Text -> Regex -> Vector Replacement ! No_Such_Group
|
||||
build_replacement_vector replacement_string pattern =
|
||||
replacement_pattern = Regex.compile group_reference_regex
|
||||
it = replacement_pattern.iterator replacement_string
|
||||
@ -119,14 +118,14 @@ build_replacement_vector replacement_string pattern =
|
||||
Parse a capture group reference.
|
||||
|
||||
Arguments:
|
||||
- pattern: the Pattern used to initiate the replacement. This is used
|
||||
- pattern: the Regex used to initiate the replacement. This is used
|
||||
to identify and validate capture groups.
|
||||
- match: the match of the replacement string against group_reference_regex.
|
||||
|
||||
Returns a Replacement: a group number, or, in the case of `$$`, a literal.
|
||||
|
||||
See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions
|
||||
parse_group_number : Pattern -> Match -> Replacement ! No_Such_Group
|
||||
parse_group_number : Regex -> Match -> Replacement ! No_Such_Group
|
||||
parse_group_number pattern match = case match.text.take 2 of
|
||||
"$$" -> Replacement.Literal "$"
|
||||
"$<" ->
|
@ -4,7 +4,7 @@ import project.Data.Numbers.Integer
|
||||
from project.Data.Range.Extensions import all
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Regex.No_Such_Group
|
||||
import project.Data.Text.Regex.Pattern.Pattern
|
||||
import project.Data.Text.Regex.Regex
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
@ -21,7 +21,7 @@ type Match
|
||||
## PRIVATE
|
||||
internal_regex_result : RegexResult (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (pattern : Pattern) (internal_regex_result : Any) (input : Text)
|
||||
Value (pattern : Regex) (internal_regex_result : Any) (input : Text)
|
||||
|
||||
## PRIVATE
|
||||
Returns the start UTF16 character index of a group.
|
||||
@ -121,7 +121,7 @@ type Match
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
(Regex.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match.utf_16_span will return the default value.
|
||||
utf_16_span : Integer | Text -> Any -> Utf_16_Span ! No_Such_Group
|
||||
utf_16_span self group=0 ~default=Nothing =
|
||||
@ -159,7 +159,7 @@ type Match
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. If the caller tries to get
|
||||
(Regex.lookup_group 3) will return 3. If the caller tries to get
|
||||
group 3, Match.span will return the default value.
|
||||
span : Integer | Text -> Any -> Span ! No_Such_Group
|
||||
span self group=0 ~default=Nothing =
|
||||
@ -187,7 +187,7 @@ type Match
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. If the caller tries to get
|
||||
(Regex.lookup_group 3) will return 3. If the caller tries to get
|
||||
group 3, Match.text will return the default value.
|
||||
text : Integer | Text -> Any -> Text ! No_Such_Group
|
||||
text self group=0 ~default=Nothing =
|
||||
@ -217,7 +217,7 @@ type Match
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. `groups` will return the
|
||||
(Regex.lookup_group 3) will return 3. `groups` will return the
|
||||
default value for groups that do not participate.
|
||||
|
||||
> Example
|
||||
@ -249,7 +249,7 @@ type Match
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. `named_groups` will map
|
||||
(Regex.lookup_group 3) will return 3. `named_groups` will map
|
||||
a named group that does not participate to the default value.
|
||||
|
||||
> Example
|
||||
|
@ -1,427 +0,0 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Filter_Condition.Filter_Condition
|
||||
import project.Data.Map.Map
|
||||
import project.Data.Numbers.Integer
|
||||
from project.Data.Range.Extensions import all
|
||||
import project.Data.Range.Range
|
||||
from project.Data.Text.Extensions import all
|
||||
import project.Data.Text.Helpers
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.No_Such_Group
|
||||
import project.Data.Text.Regex.Replacer.Replacer
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Errors.Common.Type_Error
|
||||
import project.Error.Error
|
||||
import project.Errors.Illegal_Argument.Illegal_Argument
|
||||
import project.Math
|
||||
import project.Meta
|
||||
import project.Nothing.Nothing
|
||||
import project.Polyglot.Polyglot
|
||||
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Data.Index_Sub_Range import sort_and_merge_ranges
|
||||
|
||||
polyglot java import org.enso.base.Replacer_Cache
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
|
||||
type Pattern
|
||||
## internal_regex_object : RegexObject (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (internal_regex_object : Any)
|
||||
|
||||
## Returns `True` if the input matches against the pattern described by
|
||||
`self`, otherwise `False`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to check for matching.
|
||||
matches : Text -> Boolean | Type_Error
|
||||
matches self input =
|
||||
Helpers.expect_text input <|
|
||||
m = self.internal_regex_object.exec input 0
|
||||
m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Match` containing the matched text and its match groups, or
|
||||
`Nothing` if the match failed.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match : Text -> Match | Nothing | Type_Error
|
||||
match self input =
|
||||
Helpers.expect_text input <|
|
||||
it = Match_Iterator.new self input
|
||||
case it.next of
|
||||
Match_Iterator_Value.Next _ match _ -> match
|
||||
Match_Iterator_Value.Last _ -> Nothing
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Vector Match` object, each containing the matched text
|
||||
and its match groups.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match_all : Text -> Vector Match ! Type_Error | Illegal_Argument
|
||||
match_all self input =
|
||||
Helpers.expect_text input <|
|
||||
# An empty pattern is rejected up front with `Illegal_Argument`.
pattern_is_empty = self.internal_regex_object.pattern == ''
|
||||
if pattern_is_empty then Error.throw (Illegal_Argument.Error "Cannot run match_all with an empty pattern") else
|
||||
builder = Vector.new_builder
|
||||
it = Match_Iterator.new self input
|
||||
# Step the iterator, appending every match, until it yields `Last`.
# The parameter `it` of `go` shadows the outer `it`.
go it = case it.next of
|
||||
Match_Iterator_Value.Next _ match next_it ->
|
||||
builder.append match
|
||||
@Tail_Call go next_it
|
||||
Match_Iterator_Value.Last _ -> Nothing
|
||||
go it
|
||||
builder.to_vector
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Text` containing the matched text, or `Nothing` if the match
|
||||
failed.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
find : Text -> Text | Nothing | Type_Error
|
||||
find self input =
|
||||
Helpers.expect_text input <|
|
||||
# Delegate to `match`, then reduce the result to the text of group 0
# (or `Nothing` when there was no match).
match_to_group_maybe <| self.match input
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a `Vector Text`, each containing the matched text.
|
||||
If the pattern does not match, an empty `Vector` is returned.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
find_all : Text -> Vector Text ! Type_Error
|
||||
find_all self input =
|
||||
Helpers.expect_text input <|
|
||||
# Built on `match_all`, so the empty-pattern `Illegal_Argument` raised
# there applies here as well; each match is reduced to its group 0 text.
self.match_all input . map match_to_group_maybe
|
||||
|
||||
## Splits the `input` text based on the pattern described by `self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to split based on the pattern described by `self`.
|
||||
- only_first: If true, only split at the first occurrence.
|
||||
|
||||
This method will _always_ return a vector. If no splits take place, the
|
||||
vector will contain a single element (equal to the original string).
|
||||
|
||||
> Example
|
||||
Split on the first instance of the pattern.
|
||||
pattern = Regex.compile "cd"
|
||||
input = "abcdefcdghij"
|
||||
texts = pattern.split input only_first=True
|
||||
texts . should_equal ["ab", "efcdghij"]
|
||||
|
||||
> Example
|
||||
Split on all instances of the pattern in the input.
|
||||
pattern = Regex.compile "a"
|
||||
input = "bacadaeaf"
|
||||
texts = pattern.split input
|
||||
texts . should_equal ["b", "c", "d", "e", "f"]
|
||||
|
||||
> Example
|
||||
Returns the original text if there are no matches.
|
||||
pattern = Regex.compile "aa"
|
||||
input = "abcdefghij"
|
||||
texts = pattern.split input
|
||||
texts . should_equal ["abcdefghij"]
|
||||
split : Text -> Boolean -> Vector Text | Type_Error
|
||||
split self input only_first=False =
|
||||
Helpers.expect_text input <|
|
||||
builder = Vector.new_builder
|
||||
it = Match_Iterator.new self input
|
||||
# Collect the filler text around the matches; the matched text itself is
# discarded.
go next = case next of
|
||||
Match_Iterator_Value.Next filler _ next_it ->
|
||||
builder.append filler.text
|
||||
# With `only_first`, `early_exit` returns the rest of the input as a single
# `Last` filler, so no further splitting happens.
next = if only_first then next_it.early_exit else next_it.next
|
||||
@Tail_Call go next
|
||||
Match_Iterator_Value.Last filler ->
|
||||
# Trailing filler after the final match (the whole input if nothing matched).
builder.append filler.text
|
||||
go it.next
|
||||
builder.to_vector
|
||||
|
||||
## Takes an input string and returns all the matches as a `Vector Text`.
|
||||
If the pattern contains marked groups, the values are concatenated
|
||||
together; otherwise the whole match is returned. Non-participating
|
||||
groups are omitted.
|
||||
|
||||
Arguments:
|
||||
- input: The text to tokenize.
|
||||
|
||||
> Example
|
||||
Split to blocks of 3 characters.
|
||||
|
||||
Regex.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']
|
||||
|
||||
> Example
|
||||
Split to blocks of 3 characters taking first and third letters.
|
||||
|
||||
Regex.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']
|
||||
|
||||
> Example
|
||||
Split a text on any white space.
|
||||
|
||||
Regex.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
|
||||
== ['Hello','Big','Wide','World','Goodbye!']
|
||||
tokenize : Text -> Vector Text
|
||||
tokenize self input =
|
||||
# One output text per match; `match_all` raises `Illegal_Argument` for an
# empty pattern, which therefore applies here too.
self.match_all input . map (build_tokenization_output_from_match self _)
|
||||
|
||||
## Replace all occurrences of the pattern described by `self` in the `input`
|
||||
with the specified `replacement`.
|
||||
|
||||
Arguments:
|
||||
- input: The text in which to perform the replacement(s).
|
||||
- replacement: The literal text with which to replace any matches.
|
||||
- only_first: If True, only replace the first match.
|
||||
|
||||
If this method performs no replacements it will return the `input` text
|
||||
unchanged.
|
||||
|
||||
The replacement string can contain references to groups matched by the
|
||||
regex. The following syntaxes are supported:
|
||||
$0: the entire match string
|
||||
$&: the entire match string
|
||||
$n: the nth group
|
||||
$<foo>: Named group `foo`
|
||||
|
||||
> Example
|
||||
Replace occurrences of "aa" in the text with "b".
|
||||
|
||||
pattern = Regex.compile 'aa'
|
||||
pattern.replace 'aaa' 'b' == 'ba'
|
||||
|
||||
> Example
|
||||
Replace all occurrences of letters 'l' and 'o' with '#'.
|
||||
|
||||
pattern = Regex.compile '[lo]'
|
||||
pattern.replace 'Hello World!' '#' == 'He### W#r#d!'
|
||||
|
||||
> Example
|
||||
Replace the first occurrence of letter 'l' with '#'.
|
||||
|
||||
pattern = Regex.compile 'l'
|
||||
pattern.replace 'Hello World!' '#' only_first=True == 'He#lo World!'
|
||||
|
||||
> Example
|
||||
Replace texts in quotes with parentheses.
|
||||
|
||||
pattern = Regex.compile '"(.*?)"'
|
||||
pattern.replace '"abc" foo "bar" baz' '($1)' == '(abc) foo (bar) baz'
|
||||
|
||||
> Example
|
||||
Replace a literal string with a replacement value.
|
||||
|
||||
pattern = Regex.compile "aa"
|
||||
input = "aa ab aa ac ad aa aa ax"
|
||||
match = pattern.replace input "xyz"
|
||||
match == "xyz ab xyz ac ad xyz xyz ax"
|
||||
|
||||
> Example
|
||||
Replace each word with the same word surrounded by `[]`.
|
||||
|
||||
pattern = Regex.compile "([a-z]+)"
|
||||
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
|
||||
replace : Text -> Text -> Boolean -> Text | Type_Error
|
||||
replace self input replacement only_first=False =
|
||||
Helpers.expect_text input <|
|
||||
it = Match_Iterator.new self input
|
||||
# NOTE(review): `it` comes from `Match_Iterator.new`, which builds a
# `Match_Iterator.Value`, so the `Match_Iterator_Value.Last` branch below
# looks unreachable; `case it.next of` may have been intended -- confirm.
# The fallback branch still handles the no-match case via `go`.
case it of
|
||||
Match_Iterator_Value.Last filler -> filler.text
|
||||
_ ->
|
||||
# `Replacer` is defined outside this section; presumably it interprets
# the group references (`$0`, `$n`, `$<name>`) in `replacement`.
replacer = Replacer.new replacement self
|
||||
|
||||
# Skip the replacement loop if building the replacer produced an error.
replacer.if_not_error <|
|
||||
# `current` accumulates the output: the filler before each match followed
# by the replacement computed for that match.
go next current = case next of
|
||||
Match_Iterator_Value.Next filler match next_it ->
|
||||
new_value = current + filler.text + (replacer.replace match)
|
||||
# With `only_first`, the rest of the input is taken verbatim via `early_exit`.
next = if only_first then next_it.early_exit else next_it.next
|
||||
@Tail_Call go next new_value
|
||||
Match_Iterator_Value.Last filler ->
|
||||
# Append whatever follows the final match and finish.
current + filler.text
|
||||
go it.next ""
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Look up a match group name or number, and check that it is valid.
|
||||
|
||||
Arguments:
|
||||
- id: The name or number of the group that was asked for.
|
||||
|
||||
Returns: a group number.
|
||||
|
||||
A group number is invalid if it is outside the range of groups
|
||||
that were in the original pattern.
|
||||
|
||||
A group name is invalid if it was not defined in the original pattern.
|
||||
|
||||
A group name is an alias for a group number; if a name is passed to
|
||||
this method, it returns the corresponding group number.
|
||||
|
||||
If a group number is passed to `lookup_group` and it is valid, it will
|
||||
simply return the group number.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match.group will return Nothing.
|
||||
|
||||
lookup_group : Integer | Text -> Integer ! No_Such_Group
|
||||
lookup_group self id =
|
||||
case id of
|
||||
# Numeric id: valid when 0 <= n < groupCount (group 0 is the whole match).
n : Integer -> case (n >= 0 && n < self.internal_regex_object.groupCount) of
|
||||
True -> n
|
||||
False -> Error.throw (No_Such_Group.Error n)
|
||||
# Named id: resolve the name to its group number.
name : Text ->
|
||||
# Maps name to number
|
||||
groups = self.internal_regex_object.groups
|
||||
|
||||
n = case groups of
|
||||
# If Nothing, there are no named groups
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
_ -> read_group_map groups name
|
||||
# `read_group_map` yields `Nothing` for a name that is not in the map.
case n of
|
||||
_ : Integer -> n
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Return a lazy iterator over matches against a string.
|
||||
|
||||
Arguments:
|
||||
- input: the string to match against.
|
||||
iterator : Text -> Match_Iterator
|
||||
# Thin wrapper over `Match_Iterator.new`; no matching happens until `next`
# is called on the result.
iterator self input = Match_Iterator.new self input
|
||||
|
||||
## Return the number of groups in the underlying RegexObject.
|
||||
Note, the count includes group 0 (the whole match) as well.
|
||||
group_count : Integer
|
||||
# Read directly from the underlying regex object's `groupCount`.
group_count self = self.internal_regex_object.groupCount
|
||||
|
||||
## Return a vector of all named group names.
|
||||
named_groups : Vector Text
|
||||
named_groups self =
|
||||
# `groups` maps group name to group number (see `lookup_group`), so the
# keys of the converted map are the group names.
map = polyglot_map_to_map self.internal_regex_object.groups
|
||||
map.keys
|
||||
|
||||
## Return a map from group number to group name. Only includes named groups.
|
||||
group_nums_to_names : Map Integer Text
|
||||
group_nums_to_names self =
|
||||
# `groups` maps name to number; swap each pair to get number to name.
map = polyglot_map_to_map self.internal_regex_object.groups
|
||||
map.transform k-> v-> [v, k]
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Performs the regex match, and iterates through the results. Yields both
|
||||
the matched parts of the string, and the 'filler' parts between them.
|
||||
|
||||
The 'filler' elements are `Utf_16_Span`s, not `Span`s. This is because
|
||||
matches and replacement boundaries can fall in the middle of multi-
|
||||
character graphemes, thereby splitting them apart.
|
||||
|
||||
At each step, it yields a Match_Iterator_Value, which has either a filler
|
||||
and a match, or just the final filler. A Match_Iterator_Value.Last value is
|
||||
returned at the end, and only at the end.
|
||||
|
||||
Optionally, you can call `early_exit` to have it return the remainder of
|
||||
the string, unmatched, as a single Last value. (Used for `replace` with
|
||||
`only_first=True`.)
|
||||
type Match_Iterator
|
||||
## PRIVATE
|
||||
new : Pattern -> Text -> Match_Iterator
|
||||
# Start with the cursor at offset 0, i.e. at the beginning of `input`.
new pattern input = Match_Iterator.Value pattern input 0
|
||||
|
||||
## PRIVATE
|
||||
Value (pattern : Pattern) (input : Text) (cursor : Integer)
|
||||
|
||||
## PRIVATE
|
||||
Return the next match, or the last filler string if there is no
|
||||
additional match.
|
||||
|
||||
Also returns the next iterator, if there was a match.
|
||||
next : Match_Iterator_Value
|
||||
next self =
|
||||
# Do not run the regex once the cursor has moved past the end of the input;
# the cursor can get there after a zero-length match at the very end (see
# `next_cursor` below).
# NOTE(review): this bound uses `char_vector.length` while the filler range
# below uses `Text_Utils.char_length` -- presumably the same value; confirm.
regex_result = if self.cursor > self.input.char_vector.length then Nothing else self.pattern.internal_regex_object.exec self.input self.cursor
|
||||
# NOTE(review): relies on `&&` not evaluating `isMatch` when the result is
# `Nothing` -- confirm.
case regex_result.is_nothing.not && regex_result.isMatch of
|
||||
False ->
|
||||
# No further match: everything from the cursor to the end is the last filler.
# NOTE(review): when the cursor is already past the end, this range starts
# after it ends; presumably that yields empty text -- confirm.
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input)
|
||||
Match_Iterator_Value.Last filler_span
|
||||
True ->
|
||||
# The filler is the unmatched text between the cursor and the match start.
match_start = regex_result.getStart 0
|
||||
filler_range = Range.new self.cursor match_start
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input)
|
||||
match = Match.Value self.pattern regex_result self.input
|
||||
## Handle edge case where match is 0 length
|
||||
# Always advance by at least one position so that a zero-length match
# cannot be returned again from the same cursor.
next_cursor = (self.cursor + 1).max (match.utf_16_end 0)
|
||||
next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
|
||||
Match_Iterator_Value.Next filler_span match next_iterator
|
||||
|
||||
## PRIVATE
|
||||
Returns the remainder of the string, unmatched.
|
||||
early_exit : Match_Iterator_Value
|
||||
early_exit self =
|
||||
# Everything from the current cursor to the end of the input is wrapped in a
# single `Last` value, without running the regex again.
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
|
||||
filler_span = Utf_16_Span.Value filler_range self.input
|
||||
Match_Iterator_Value.Last filler_span
|
||||
|
||||
## PRIVATE
|
||||
type Match_Iterator_Value
|
||||
# A step that found a match: the filler preceding it, the match itself, and
# the iterator positioned to continue after it.
## PRIVATE
|
||||
Next (filler : Utf_16_Span) (match : Match) (next_iterator : Match_Iterator)
|
||||
|
||||
# The final step: only the remaining filler, with no match and no iterator.
## PRIVATE
|
||||
Last (filler : Utf_16_Span)
|
||||
|
||||
## PRIVATE
|
||||
Convert the polyglot map to a Map.
|
||||
polyglot_map_to_map : Any -> Map Any Any
|
||||
polyglot_map_to_map map =
|
||||
# The member names of the polyglot object become the keys of the result.
polyglot_keys = Polyglot.get_members map
|
||||
keys = Vector.from_polyglot_array polyglot_keys
|
||||
# Pair every key with the member value read from the polyglot object.
pairs = keys.map key-> [key, Polyglot.get_member map key]
|
||||
Map.from_vector pairs
|
||||
|
||||
## PRIVATE
|
||||
Get the named group from the polyglot map.
|
||||
read_group_map : Any -> Text -> Integer | Nothing
|
||||
read_group_map polyglot_map name =
|
||||
# The whole polyglot map is converted on every call before the lookup.
map = polyglot_map_to_map polyglot_map
|
||||
# Per the signature, the lookup gives `Nothing` when `name` is absent;
# `lookup_group` turns that into a `No_Such_Group` error.
map.get name
|
||||
|
||||
## PRIVATE
|
||||
match_to_group_maybe : Match | Nothing -> Text | Nothing
|
||||
match_to_group_maybe match =
|
||||
# Pass `Nothing` through unchanged; otherwise take the text of group 0
# (the whole match).
if match.is_nothing then Nothing else match.text 0
|
||||
|
||||
## PRIVATE
|
||||
Build an output string from a Match resulting from `tokenize`.
|
||||
See `tokenize`.
|
||||
build_tokenization_output_from_match : Pattern -> Match -> Text
|
||||
build_tokenization_output_from_match pattern match =
|
||||
# `group_count` includes group 0, so a count of 1 means the pattern has no
# capturing groups and the whole match is the token.
if pattern.group_count == 1 then match.text 0 else
|
||||
# Extract the ranges of the spans of all capturing groups
|
||||
group_numbers = 1.up_to pattern.group_count
|
||||
# Groups whose span is `Nothing` are dropped by the filter.
spans = group_numbers.map (n-> match.span n) . filter Filter_Condition.Not_Nothing
|
||||
ranges = spans.map span-> case span of Span.Value range _ -> range
|
||||
|
||||
# Eliminate nested capturing groups by sorting and merging the ranges.
|
||||
top_level_ranges = sort_and_merge_ranges ranges
|
||||
|
||||
# Reconstruct `Spans` from the synthesized `Ranges`, and concatenate.
|
||||
# The source text is taken from the first remaining span.
# NOTE(review): if every capturing group was filtered out, `spans` is empty
# and `spans.at 0` presumably fails -- confirm whether that can happen.
text_all = case spans.at 0 of Span.Value _ text -> text
|
||||
top_level_spans = top_level_ranges.map range-> Span.Value range text_all
|
||||
top_level_spans.map (.text) . join
|
@ -100,7 +100,7 @@ import project.Data.Text.Encoding.Encoding
|
||||
import project.Data.Text.Line_Ending_Style.Line_Ending_Style
|
||||
import project.Data.Text.Location.Location
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Regex
|
||||
import project.Data.Text.Text_Ordering.Text_Ordering
|
||||
import project.Data.Text.Text_Sub_Range.Text_Sub_Range
|
||||
import project.Data.Time.Date.Date
|
||||
@ -152,7 +152,7 @@ export project.Data.Text.Encoding.Encoding
|
||||
export project.Data.Text.Line_Ending_Style.Line_Ending_Style
|
||||
export project.Data.Text.Location.Location
|
||||
export project.Data.Text.Matching_Mode.Matching_Mode
|
||||
export project.Data.Text.Regex
|
||||
export project.Data.Text.Regex.Regex
|
||||
export project.Data.Text.Text_Ordering.Text_Ordering
|
||||
export project.Data.Text.Text_Sub_Range.Text_Sub_Range
|
||||
export project.Data.Time.Date.Date
|
||||
|
@ -5,7 +5,6 @@ import project.Data.Table.Table
|
||||
import project.Data.Type.Value_Type.Value_Type
|
||||
import project.Internal.Problem_Builder.Problem_Builder
|
||||
import project.Internal.Unique_Name_Strategy.Unique_Name_Strategy
|
||||
import Standard.Base.Data.Text.Regex.Pattern.Pattern
|
||||
|
||||
from project.Errors import Column_Count_Exceeded, Column_Count_Mismatch, Duplicate_Output_Column_Names, Invalid_Value_Type, Missing_Input_Columns
|
||||
from project.Internal.Java_Exports import make_string_builder
|
||||
@ -72,7 +71,7 @@ parse_to_columns table input_column_id pattern="." case_sensitivity=Case_Sensiti
|
||||
Create a parser from a regex to a nested `Vector`. Each match becomes an
|
||||
element of the vector; each group (or the whole match, if there are no
|
||||
groups) becomes an element of the inner vectors.
|
||||
regex_parse_to_vectors : Pattern -> (Text -> Vector (Vector (Text | Nothing)))
|
||||
regex_parse_to_vectors : Regex -> (Text -> Vector (Vector (Text | Nothing)))
|
||||
regex_parse_to_vectors pattern =
|
||||
input->
|
||||
matches = pattern.match_all input
|
||||
@ -85,7 +84,7 @@ regex_parse_to_vectors pattern =
|
||||
If the regex has no explicit groups, it uses the original column name
|
||||
unchanged; otherwise, it uses the group name if it exists, or the original
|
||||
column name with a number.
|
||||
regex_to_column_names : Pattern -> Text -> Vector Text
|
||||
regex_to_column_names : Regex -> Text -> Vector Text
|
||||
regex_to_column_names pattern original_column_name =
|
||||
case pattern.group_count of
|
||||
1 ->
|
||||
|
@ -3,13 +3,13 @@ import Standard.Base.Data.Text.Span.Span
|
||||
import Standard.Base.Data.Text.Span.Utf_16_Span
|
||||
import Standard.Base.Data.Text.Regex.Match.Match
|
||||
import Standard.Base.Data.Text.Regex.No_Such_Group
|
||||
import Standard.Base.Data.Text.Regex.Pattern.Pattern
|
||||
import Standard.Base.Data.Text.Regex.Regex
|
||||
import Standard.Base.Data.Text.Regex.Regex_Syntax_Error
|
||||
import Standard.Base.Data.Text.Regex.Replacer.Replacer
|
||||
import Standard.Base.Data.Text.Regex.Internal.Replacer.Replacer
|
||||
import Standard.Base.Errors.Common.Type_Error
|
||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||
|
||||
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
|
||||
from Standard.Base.Data.Text.Regex.Internal.Replacer import get_lru_size, replacer_cache_lookup
|
||||
|
||||
from Standard.Test import Test, Test_Suite
|
||||
import Standard.Test.Extensions
|
||||
@ -20,7 +20,7 @@ spec =
|
||||
Test.group "Compile" <|
|
||||
Test.specify "should be able to be compiled" <|
|
||||
pattern = Regex.compile "(?<dots>..)" case_insensitive=True
|
||||
pattern . should_be_a Pattern
|
||||
pattern . should_be_a Regex
|
||||
|
||||
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
|
||||
Regex.compile "ab(c(((((((" . should_fail_with Regex_Syntax_Error
|
||||
|
Loading…
Reference in New Issue
Block a user