mirror of
https://github.com/enso-org/enso.git
synced 2024-12-23 15:52:05 +03:00
Implement Regular Expression split and update Text.split to the new API (#6116)
Re-implement split on top of Truffle regex.
This commit is contained in:
parent
7f8230b62d
commit
c8f5a91d6c
@ -366,8 +366,10 @@
|
|||||||
- [Aligned names of columns created by column operations.][5850]
|
- [Aligned names of columns created by column operations.][5850]
|
||||||
- [Improved `cross_tab`. Renamed `fill_missing` and `is_missing` to
|
- [Improved `cross_tab`. Renamed `fill_missing` and `is_missing` to
|
||||||
`fill_nothing` and `is_nothing`. Added `fill_empty`.][5863]
|
`fill_nothing` and `is_nothing`. Added `fill_empty`.][5863]
|
||||||
- [Removed many regex compile flags from `replace`; added `only_first`
|
- [Removed many regex compile flags from `replace`; added `only_first` and
|
||||||
flag.][5959]
|
`use_regex` flags.][5959]
|
||||||
|
- [Removed many regex compile flags from `split`; added `only_first` and
|
||||||
|
`use_regex` flags.][6116]
|
||||||
|
|
||||||
[debug-shortcuts]:
|
[debug-shortcuts]:
|
||||||
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
|
||||||
@ -556,6 +558,7 @@
|
|||||||
[5917]: https://github.com/enso-org/enso/pull/5917
|
[5917]: https://github.com/enso-org/enso/pull/5917
|
||||||
[5705]: https://github.com/enso-org/enso/pull/5705
|
[5705]: https://github.com/enso-org/enso/pull/5705
|
||||||
[5959]: https://github.com/enso-org/enso/pull/5959
|
[5959]: https://github.com/enso-org/enso/pull/5959
|
||||||
|
[6116]: https://github.com/enso-org/enso/pull/6116
|
||||||
|
|
||||||
#### Enso Compiler
|
#### Enso Compiler
|
||||||
|
|
||||||
|
@ -115,6 +115,7 @@ invert_range_selection ranges length needs_sorting =
|
|||||||
merged.
|
merged.
|
||||||
|
|
||||||
Empty subranges are discarded.
|
Empty subranges are discarded.
|
||||||
|
sort_and_merge_ranges : Vector Range -> Vector Range
|
||||||
sort_and_merge_ranges ranges =
|
sort_and_merge_ranges ranges =
|
||||||
sorted = ranges.filter (range-> range.is_empty.not) . sort on=(.start)
|
sorted = ranges.filter (range-> range.is_empty.not) . sort on=(.start)
|
||||||
if sorted.is_empty then [] else
|
if sorted.is_empty then [] else
|
||||||
|
@ -15,13 +15,11 @@ import project.Data.Text.Location.Location
|
|||||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||||
import project.Data.Text.Regex.Match.Match
|
import project.Data.Text.Regex.Match.Match
|
||||||
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
|
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
|
||||||
import project.Data.Text.Regex_Matcher.Regex_Matcher
|
|
||||||
import project.Data.Text.Regex_2
|
import project.Data.Text.Regex_2
|
||||||
import project.Data.Text.Regex_2.Regex_Syntax_Error
|
import project.Data.Text.Regex_2.Regex_Syntax_Error
|
||||||
import project.Data.Text.Span.Span
|
import project.Data.Text.Span.Span
|
||||||
import project.Data.Text.Span.Utf_16_Span
|
import project.Data.Text.Span.Utf_16_Span
|
||||||
import project.Data.Text.Text
|
import project.Data.Text.Text
|
||||||
import project.Data.Text.Text_Matcher.Text_Matcher
|
|
||||||
import project.Data.Text.Text_Sub_Range.Codepoint_Ranges
|
import project.Data.Text.Text_Sub_Range.Codepoint_Ranges
|
||||||
import project.Data.Text.Text_Sub_Range.Text_Sub_Range
|
import project.Data.Text.Text_Sub_Range.Text_Sub_Range
|
||||||
import project.Data.Vector.Vector
|
import project.Data.Vector.Vector
|
||||||
@ -237,7 +235,7 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
|||||||
Helpers.regex_assume_default_locale case_sensitivity <|
|
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||||
compiled_pattern.if_not_error <| compiled_pattern.match self
|
compiled_pattern.match self
|
||||||
|
|
||||||
## Finds all the matches of the regular expression `pattern` in `self`,
|
## Finds all the matches of the regular expression `pattern` in `self`,
|
||||||
returning a Vector. If not found, will be an empty Vector.
|
returning a Vector. If not found, will be an empty Vector.
|
||||||
@ -265,7 +263,7 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
|||||||
Helpers.regex_assume_default_locale case_sensitivity <|
|
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||||
compiled_pattern.if_not_error <| compiled_pattern.match_all self
|
compiled_pattern.match_all self
|
||||||
|
|
||||||
## ALIAS Check Matches
|
## ALIAS Check Matches
|
||||||
|
|
||||||
@ -296,7 +294,7 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
|||||||
Helpers.regex_assume_default_locale case_sensitivity <|
|
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||||
compiled_pattern.if_not_error <| compiled_pattern.matches self
|
compiled_pattern.matches self
|
||||||
|
|
||||||
## ALIAS Split Text
|
## ALIAS Split Text
|
||||||
|
|
||||||
@ -305,34 +303,34 @@ Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
|||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
- delimiter: The pattern used to split the text.
|
- delimiter: The pattern used to split the text.
|
||||||
- matcher: If a `Text_Matcher`, the text is compared using case-sensitivity
|
- case_sensitivity: Specifies if the text values should be compared case
|
||||||
rules specified in the matcher. If a `Regex_Matcher`, the term is used as a
|
sensitively. The values are compared case sensitively by default.
|
||||||
regular expression and matched using the associated options.
|
- only_first: If true, only split at the first occurrence of the delimiter.
|
||||||
|
- use_regex: If true, the term is used as a regular expression.
|
||||||
|
|
||||||
> Example
|
> Example
|
||||||
Split the text on any occurrence of the separator `"::"`.
|
Split the text on any occurrence of the separator `"::"`.
|
||||||
|
|
||||||
example_split =
|
text = "Namespace::package::package::Type"
|
||||||
text = "Namespace::package::package::Type"
|
text.split "::" == ["Namespace", "package", "package", "Type"]
|
||||||
text.split "::" == ["Namespace", "package", "package", "Type"]
|
|
||||||
|
|
||||||
> Example
|
> Example
|
||||||
Split the text on a regex pattern.
|
Split the text on a regex pattern.
|
||||||
|
|
||||||
"abc--def==>ghi".split "[-=>]+" Regex_Matcher.Value == ["abc", "def", "ghi"]
|
"abc--def==>ghi".split "[-=>]+" use_regex=True == ["abc", "def", "ghi"]
|
||||||
|
|
||||||
> Example
|
> Example
|
||||||
Split the text on any whitespace.
|
Split the text on any whitespace.
|
||||||
|
|
||||||
'abc def\tghi'.split '\\s+' Regex_Matcher.Value == ["abc", "def", "ghi"]
|
'abc def\tghi'.split '\\s+' use_regex=True == ["abc", "def", "ghi"]
|
||||||
Text.split : Text -> (Text_Matcher | Regex_Matcher) -> Vector Text
|
Text.split : Text -> Case_Sensitivity -> Boolean -> Boolean -> Vector Text | Illegal_Argument
|
||||||
Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter.is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else
|
Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive only_first=False use_regex=False = if delimiter.is_empty then Error.throw (Illegal_Argument.Error "The delimiter cannot be empty.") else
|
||||||
case matcher of
|
case use_regex of
|
||||||
_ : Text_Matcher ->
|
False ->
|
||||||
delimiters = Vector.from_polyglot_array <| case matcher of
|
delimiters = Vector.from_polyglot_array <| case case_sensitivity of
|
||||||
Text_Matcher.Case_Sensitive ->
|
Case_Sensitivity.Sensitive ->
|
||||||
Text_Utils.span_of_all self delimiter
|
Text_Utils.span_of_all self delimiter
|
||||||
Text_Matcher.Case_Insensitive locale ->
|
Case_Sensitivity.Insensitive locale ->
|
||||||
Text_Utils.span_of_all_case_insensitive self delimiter locale.java_locale
|
Text_Utils.span_of_all_case_insensitive self delimiter locale.java_locale
|
||||||
Vector.new delimiters.length+1 i->
|
Vector.new delimiters.length+1 i->
|
||||||
start = if i == 0 then 0 else
|
start = if i == 0 then 0 else
|
||||||
@ -340,9 +338,11 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
|
|||||||
end = if i == delimiters.length then (Text_Utils.char_length self) else
|
end = if i == delimiters.length then (Text_Utils.char_length self) else
|
||||||
delimiters.at i . codeunit_start
|
delimiters.at i . codeunit_start
|
||||||
Text_Utils.substring self start end
|
Text_Utils.substring self start end
|
||||||
_ : Regex_Matcher ->
|
True ->
|
||||||
compiled_pattern = matcher.compile delimiter
|
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||||
compiled_pattern.split self mode=Regex_Mode.All
|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||||
|
compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
|
||||||
|
compiled_pattern.split self only_first
|
||||||
|
|
||||||
## ALIAS Replace Text
|
## ALIAS Replace Text
|
||||||
Perform a text or regex replace.
|
Perform a text or regex replace.
|
||||||
@ -360,9 +360,8 @@ Text.split self delimiter="," matcher=Text_Matcher.Case_Sensitive = if delimiter
|
|||||||
Arguments:
|
Arguments:
|
||||||
- term: The string or regex to find.
|
- term: The string or regex to find.
|
||||||
- replacement: The text to replace matches with.
|
- replacement: The text to replace matches with.
|
||||||
- case_insensitive: Enables or disables case-insensitive matching. Case
|
- case_sensitivity: Specifies if the text values should be compared case
|
||||||
insensitive matching behaves as if it normalises the case of all input
|
sensitively.
|
||||||
text before matching on it.
|
|
||||||
- only_first: If True, only replace the first match.
|
- only_first: If True, only replace the first match.
|
||||||
- use_regex: If true, the term is used as a regular expression.
|
- use_regex: If true, the term is used as a regular expression.
|
||||||
|
|
||||||
@ -438,8 +437,7 @@ Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive o
|
|||||||
Helpers.regex_assume_default_locale case_sensitivity <|
|
Helpers.regex_assume_default_locale case_sensitivity <|
|
||||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||||
compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
|
compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
|
||||||
compiled_pattern.if_not_error <|
|
compiled_pattern.replace self replacement only_first
|
||||||
compiled_pattern.replace self replacement only_first
|
|
||||||
|
|
||||||
## ALIAS Get Words
|
## ALIAS Get Words
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@ import project.Nothing.Nothing
|
|||||||
import project.Polyglot.Polyglot
|
import project.Polyglot.Polyglot
|
||||||
|
|
||||||
from project.Data.Boolean import Boolean, True, False
|
from project.Data.Boolean import Boolean, True, False
|
||||||
|
from project.Data.Index_Sub_Range import sort_and_merge_ranges
|
||||||
|
|
||||||
polyglot java import org.enso.base.Replacer_Cache
|
polyglot java import org.enso.base.Replacer_Cache
|
||||||
polyglot java import org.enso.base.Text_Utils
|
polyglot java import org.enso.base.Text_Utils
|
||||||
@ -66,7 +67,7 @@ type Pattern_2
|
|||||||
go it = case it.next of
|
go it = case it.next of
|
||||||
Match_Iterator_Value.Next _ match next_it ->
|
Match_Iterator_Value.Next _ match next_it ->
|
||||||
builder.append match
|
builder.append match
|
||||||
go next_it
|
@Tail_Call go next_it
|
||||||
Match_Iterator_Value.Last _ -> Nothing
|
Match_Iterator_Value.Last _ -> Nothing
|
||||||
go it
|
go it
|
||||||
builder.to_vector
|
builder.to_vector
|
||||||
@ -93,6 +94,79 @@ type Pattern_2
|
|||||||
find_all self input =
|
find_all self input =
|
||||||
self.match_all input . map match_to_group_maybe
|
self.match_all input . map match_to_group_maybe
|
||||||
|
|
||||||
|
## ADVANCED
|
||||||
|
|
||||||
|
Splits the `input` text based on the pattern described by `self`.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- input: The text to split based on the pattern described by `self`.
|
||||||
|
- only_first: If True, only split at the first occurrence.
|
||||||
|
|
||||||
|
This method will _always_ return a vector. If no splits take place, the
|
||||||
|
vector will contain a single element (equal to the original string).
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Split on the first instance of the pattern.
|
||||||
|
pattern = Regex_2.compile "cd"
|
||||||
|
input = "abcdefcdghij"
|
||||||
|
texts = pattern.split input only_first=True
|
||||||
|
texts . should_equal ["ab", "efcdghij"]
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Split on all instances of the pattern in the input.
|
||||||
|
pattern = Regex_2.compile "a"
|
||||||
|
input = "bacadaeaf"
|
||||||
|
texts = pattern.split input
|
||||||
|
texts . should_equal ["b", "c", "d", "e", "f"]
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Returns the original text if there are no matches.
|
||||||
|
pattern = Regex_2.compile "aa"
|
||||||
|
input = "abcdefghij"
|
||||||
|
texts = pattern.split input
|
||||||
|
texts . should_equal ["abcdefghij"]
|
||||||
|
split : Text -> Boolean -> Vector Text
|
||||||
|
split self input only_first=False =
|
||||||
|
builder = Vector.new_builder
|
||||||
|
it = Match_Iterator.new self input
|
||||||
|
go next = case next of
|
||||||
|
Match_Iterator_Value.Next filler _ next_it ->
|
||||||
|
builder.append filler.text
|
||||||
|
next = if only_first then next_it.early_exit else next_it.next
|
||||||
|
@Tail_Call go next
|
||||||
|
Match_Iterator_Value.Last filler ->
|
||||||
|
builder.append filler.text
|
||||||
|
go it.next
|
||||||
|
builder.to_vector
|
||||||
|
|
||||||
|
## ADVANCED
|
||||||
|
|
||||||
|
Takes an input string and returns all the matches as a `Vector Text`.
|
||||||
|
If the pattern contains marked groups, the values are concatenated
|
||||||
|
together; otherwise the whole match is returned.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- input: The text to tokenize.
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Split to blocks of 3 characters.
|
||||||
|
|
||||||
|
Regex_2.compile '...' . tokenize 'ABCDEF' == ['ABC','DEF']
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Split to blocks of 3 characters taking first and third letters.
|
||||||
|
|
||||||
|
Regex_2.compile '(.).(.)' . tokenize 'ABCDEF' == ['AC','DF']
|
||||||
|
|
||||||
|
> Example
|
||||||
|
Split a text on any white space.
|
||||||
|
|
||||||
|
Regex_2.compile '(\S+)(?:\s+|$)' . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!'
|
||||||
|
== ['Hello','Big','Wide','World','Goodbye!']
|
||||||
|
tokenize : Text -> Vector Text
|
||||||
|
tokenize self input =
|
||||||
|
self.match_all input . map (build_tokenization_output_from_match self _)
|
||||||
|
|
||||||
## ADVANCED
|
## ADVANCED
|
||||||
|
|
||||||
Replace all occurrences of the pattern described by `self` in the `input`
|
Replace all occurrences of the pattern described by `self` in the `input`
|
||||||
@ -150,7 +224,6 @@ type Pattern_2
|
|||||||
|
|
||||||
pattern = Regex_2.compile "([a-z]+)"
|
pattern = Regex_2.compile "([a-z]+)"
|
||||||
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
|
pattern.replace "foo bar, baz" "[$1]" == "[foo] [bar], [baz]"
|
||||||
|
|
||||||
replace : Text -> Text -> Boolean -> Text
|
replace : Text -> Text -> Boolean -> Text
|
||||||
replace self input replacement only_first=False =
|
replace self input replacement only_first=False =
|
||||||
it = Match_Iterator.new self input
|
it = Match_Iterator.new self input
|
||||||
@ -329,3 +402,22 @@ read_group_map polyglot_map name =
|
|||||||
match_to_group_maybe : Match_2 | Nothing -> Text | Nothing
|
match_to_group_maybe : Match_2 | Nothing -> Text | Nothing
|
||||||
match_to_group_maybe match =
|
match_to_group_maybe match =
|
||||||
if match.is_nothing then Nothing else match.text 0
|
if match.is_nothing then Nothing else match.text 0
|
||||||
|
|
||||||
|
## PRIVATE
|
||||||
|
Build an output string from a Match_2 resulting from `tokenize`.
|
||||||
|
See `tokenize`.
|
||||||
|
build_tokenization_output_from_match : Pattern_2 -> Match_2 -> Text
|
||||||
|
build_tokenization_output_from_match pattern match =
|
||||||
|
if pattern.group_count == 1 then match.text 0 else
|
||||||
|
# Extract the ranges of the spans of all capturing groups
|
||||||
|
group_numbers = 1.up_to pattern.group_count
|
||||||
|
spans = group_numbers.map n-> match.span n
|
||||||
|
ranges = spans.map span-> case span of Span.Value range _ -> range
|
||||||
|
|
||||||
|
# Eliminate nested capturing groups by sorting and merging the ranges.
|
||||||
|
top_level_ranges = sort_and_merge_ranges ranges
|
||||||
|
|
||||||
|
# Reconstruct `Spans` from the synthesized `Ranges`, and concatenate.
|
||||||
|
text_all = case spans.at 0 of Span.Value _ text -> text
|
||||||
|
top_level_spans = top_level_ranges.map range-> Span.Value range text_all
|
||||||
|
top_level_spans.map (.text) . join
|
||||||
|
@ -8,6 +8,7 @@ import Standard.Base.Data.Text.Regex_2
|
|||||||
import Standard.Base.Data.Text.Regex_2.No_Such_Group
|
import Standard.Base.Data.Text.Regex_2.No_Such_Group
|
||||||
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
|
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
|
||||||
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
|
||||||
|
import Standard.Base.IO
|
||||||
|
|
||||||
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
|
from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
|
||||||
|
|
||||||
@ -17,6 +18,13 @@ import Standard.Test.Extensions
|
|||||||
polyglot java import org.enso.base.Replacer_Cache
|
polyglot java import org.enso.base.Replacer_Cache
|
||||||
|
|
||||||
spec =
|
spec =
|
||||||
|
Test.group "gmt" <|
|
||||||
|
Test.specify "asdf" <|
|
||||||
|
IO.println <| Regex_2.compile 's\u{301}' . replace 'sśs\u{301}' '-'
|
||||||
|
IO.println <| Regex_2.compile 'a\u{301}' . match_all "aááêe xêy"
|
||||||
|
#Regex_2.compile 'a\u{301}+' . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
|
||||||
|
Regex_2.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
|
||||||
|
|
||||||
Test.group "Compile" <|
|
Test.group "Compile" <|
|
||||||
Test.specify "should be able to be compiled" <|
|
Test.specify "should be able to be compiled" <|
|
||||||
pattern = Regex_2.compile "(?<dots>..)" case_insensitive=True
|
pattern = Regex_2.compile "(?<dots>..)" case_insensitive=True
|
||||||
@ -136,55 +144,77 @@ spec =
|
|||||||
pattern = Regex_2.compile ""
|
pattern = Regex_2.compile ""
|
||||||
pattern.find_all "ABC" . should_fail_with Illegal_Argument
|
pattern.find_all "ABC" . should_fail_with Illegal_Argument
|
||||||
|
|
||||||
##
|
Test.group "Pattern_2.split" <|
|
||||||
Test.group "The default regex engine's Pattern.split" <|
|
Test.specify "should be able to `split` on the first instance of the pattern" <|
|
||||||
engine = Default_Engine.new
|
pattern = Regex_2.compile "cd"
|
||||||
|
input = "abcdefcdghij"
|
||||||
|
texts = pattern.split input only_first=True
|
||||||
|
texts . should_equal ["ab", "efcdghij"]
|
||||||
|
|
||||||
Test.specify "should be able to `split` on the first instance of the pattern" <|
|
Test.specify "should return the original text if there are no matches in first mode" <|
|
||||||
pattern = engine.compile "cd" []
|
pattern = Regex_2.compile "aa"
|
||||||
input = "abcdefghij"
|
input = "abcdefghij"
|
||||||
match = pattern.split input mode=Matching_Mode.First
|
texts = pattern.split input only_first=True
|
||||||
match.length . should_equal 2
|
texts . should_equal ["abcdefghij"]
|
||||||
match.at 0 . should_equal "ab"
|
|
||||||
match.at 1 . should_equal "efghij"
|
|
||||||
|
|
||||||
Test.specify "should return the original text if there are no matches in first mode" <|
|
Test.specify "should return the original text if there are no matches in all mode" <|
|
||||||
pattern = engine.compile "(aa)" []
|
pattern = Regex_2.compile "aa"
|
||||||
input = "abcdefghij"
|
input = "abcdefghij"
|
||||||
match = pattern.split input mode=Matching_Mode.First
|
texts = pattern.split input
|
||||||
match . should_equal ["abcdefghij"]
|
texts . should_equal ["abcdefghij"]
|
||||||
|
|
||||||
Test.specify "should be able to `split` on at most N instances of the pattern in the input" <|
|
Test.specify "should be able to `split` on the all instances of the pattern in the input" <|
|
||||||
pattern = engine.compile "a" []
|
pattern = Regex_2.compile "a"
|
||||||
input = "bacadaeaf"
|
pattern.split "bacadaeaf" . should_equal ["b", "c", "d", "e", "f"]
|
||||||
match = pattern.split input mode=3
|
pattern.split "baab" . should_equal ["b", "", "b"]
|
||||||
match.length . should_equal 4
|
pattern.split "aaa" . should_equal ["", "", "", ""]
|
||||||
match.at 0 . should_equal "b"
|
pattern.split "" . should_equal [""]
|
||||||
match.at 1 . should_equal "c"
|
pattern.split "a" . should_equal ["", ""]
|
||||||
match.at 2 . should_equal "d"
|
pattern.split "abaca" . should_equal ["", "b", "c", ""]
|
||||||
match.at 3 . should_equal "eaf"
|
|
||||||
|
|
||||||
Test.specify "should `split` on fewer than N instances when there are fewer than N in the input" <|
|
Test.specify "should split without normalization" <|
|
||||||
pattern = engine.compile "a" []
|
pattern = Regex_2.compile "s"
|
||||||
input = "bacadaeaf"
|
pattern.split 'aśsśs\u{301}śb' . should_equal ['aś', 'ś', '\u{301}śb']
|
||||||
match = pattern.split input mode=10
|
|
||||||
match.length . should_equal 5
|
|
||||||
match.at 0 . should_equal "b"
|
|
||||||
match.at 1 . should_equal "c"
|
|
||||||
match.at 2 . should_equal "d"
|
|
||||||
match.at 3 . should_equal "e"
|
|
||||||
match.at 4 . should_equal "f"
|
|
||||||
|
|
||||||
Test.specify "should be able to `split` on the all instances of the pattern in the input" <|
|
Test.group "Pattern_2.tokenize" <|
|
||||||
pattern = engine.compile "(a)" []
|
Test.specify "can tokenize simple regexes without capturing groups"
|
||||||
input = "bacadaeaf"
|
Regex_2.compile "[a-z]+" . tokenize "1-800-regex-yes" . should_equal ["regex", "yes"]
|
||||||
match = pattern.split input mode=Regex_Mode.All
|
Regex_2.compile "[a-z]+" case_insensitive=True . tokenize "1-800-REGEX-YES" . should_equal ["REGEX", "YES"]
|
||||||
match.length . should_equal 5
|
Regex_2.compile "\d\d" . tokenize "12 hi345 67r890r" . should_equal ["12", "34", "67", "89"]
|
||||||
match.at 0 . should_equal "b"
|
|
||||||
match.at 1 . should_equal "c"
|
Test.specify "can tokenize regexes with capturing groups"
|
||||||
match.at 2 . should_equal "d"
|
Regex_2.compile "(\d\d)\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||||
match.at 3 . should_equal "e"
|
Regex_2.compile "[a-z]+(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
|
||||||
match.at 4 . should_equal "f"
|
Regex_2.compile "[a-z]+(\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["", "182", "20", ""]
|
||||||
|
|
||||||
|
Test.specify "ignores non-capturing groups"
|
||||||
|
Regex_2.compile "(?:(\d\d)\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||||
|
Regex_2.compile "(\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||||
|
Regex_2.compile "(?:[a-z]+)(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
|
||||||
|
|
||||||
|
Test.specify "ignores nested groups"
|
||||||
|
Regex_2.compile "(\d(\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
|
||||||
|
Regex_2.compile "[a-z]+((\d)\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
|
||||||
|
Regex_2.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"]
|
||||||
|
|
||||||
|
Test.specify "handles unicode" <|
|
||||||
|
Regex_2.compile "[áê]+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
|
||||||
|
#fail
|
||||||
|
#Regex_2.compile '[a\u{301}e\u{301}]+' . tokenize 'aááêe xêy' . should_equal ['a\u{301}a\u{301}e\u{301}', 'e\u{301}']
|
||||||
|
#Regex_2.compile '(?:a\u{301})+' . tokenize 'aááêe xêy' . should_equal ['a\u{301}a\u{301}']
|
||||||
|
#Regex_2.compile 'a\u{301}' . tokenize 'aááêe xêy' . should_equal ['a\u{301}', 'a\u{301}']
|
||||||
|
# Wrong
|
||||||
|
Regex_2.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
|
||||||
|
# Fails
|
||||||
|
Regex_2.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
|
||||||
|
# fails
|
||||||
|
#Regex_2.compile "a\u{301}+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
|
||||||
|
Regex_2.compile "x([áê]+)y" . tokenize "xáy xêy" . should_equal ["á", "ê"]
|
||||||
|
|
||||||
|
Test.specify "examples are correct" <|
|
||||||
|
Regex_2.compile "..." . tokenize "ABCDEF" . should_equal ["ABC","DEF"]
|
||||||
|
Regex_2.compile "(.).(.)" . tokenize "ABCDEF" . should_equal ["AC","DF"]
|
||||||
|
Regex_2.compile "(\S+)(?:\s+|$)" . tokenize 'Hello Big\r\nWide\tWorld\nGoodbye!' . should_equal ["Hello","Big","Wide","World","Goodbye!"]
|
||||||
|
|
||||||
Test.group "Pattern_2.replace" <|
|
Test.group "Pattern_2.replace" <|
|
||||||
Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
|
Test.specify "should be able to `replace` the first instance of the pattern in the input" <|
|
||||||
|
@ -242,30 +242,42 @@ spec =
|
|||||||
'abc'.split '' . should_fail_with Illegal_Argument
|
'abc'.split '' . should_fail_with Illegal_Argument
|
||||||
|
|
||||||
Test.specify "should be able to split the text on arbitrary text sequence, case-insensitively" <|
|
Test.specify "should be able to split the text on arbitrary text sequence, case-insensitively" <|
|
||||||
matcher = Text_Matcher.Case_Insensitive
|
"AbCdABCDabDCba" . split "ab" case_sensitivity=Case_Sensitivity.Insensitive . should_equal ["", "Cd", "CD", "DCba"]
|
||||||
"AbCdABCDabDCba" . split "ab" matcher . should_equal ["", "Cd", "CD", "DCba"]
|
"abc".split "d" case_sensitivity=Case_Sensitivity.Insensitive . should_equal ["abc"]
|
||||||
"abc".split "d" matcher . should_equal ["abc"]
|
"AAA".split "a" case_sensitivity=Case_Sensitivity.Insensitive . should_equal ["", "", "", ""]
|
||||||
"AAA".split "a" matcher . should_equal ["", "", "", ""]
|
"baB".split "b" case_sensitivity=Case_Sensitivity.Insensitive . should_equal ["", "a", ""]
|
||||||
"baB".split "b" matcher . should_equal ["", "a", ""]
|
"".split "a" case_sensitivity=Case_Sensitivity.Insensitive . should_equal [""]
|
||||||
"".split "a" matcher . should_equal [""]
|
'aŚbS\u{301}c'.split 'ś' case_sensitivity=Case_Sensitivity.Insensitive . should_equal ['a', 'b', 'c']
|
||||||
'aŚbS\u{301}c'.split 'ś' matcher . should_equal ['a', 'b', 'c']
|
'abc'.split '' case_sensitivity=Case_Sensitivity.Insensitive . should_fail_with Illegal_Argument
|
||||||
'abc'.split '' matcher . should_fail_with Illegal_Argument
|
|
||||||
|
|
||||||
Test.specify "should be able to split the text on Regex patterns" <|
|
Test.specify "should be able to split the text on Regex patterns" <|
|
||||||
"cababdabe" . split "ab" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal ["c", "", "d", "e"]
|
"cababdabe" . split "ab" use_regex=True . should_equal ["c", "", "d", "e"]
|
||||||
"cababdabe" . split "(ab)+" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal ["c", "d", "e"]
|
"cababdabe" . split "(ab)+" use_regex=True . should_equal ["c", "d", "e"]
|
||||||
"abc" . split "[a-z]" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal ["", "", "", ""]
|
"abc" . split "[a-z]" use_regex=True . should_equal ["", "", "", ""]
|
||||||
"abc--def==>ghi".split "[-=>]+" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) == ["abc", "def", "ghi"]
|
"abc--def==>ghi".split "[-=>]+" use_regex=True == ["abc", "def", "ghi"]
|
||||||
"abc".split "." (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal ["", "", "", ""]
|
"abc".split "." use_regex=True . should_equal ["", "", "", ""]
|
||||||
"abc".split "d" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal ["abc"]
|
"abc".split "d" use_regex=True . should_equal ["abc"]
|
||||||
".a.".split "\." (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal ["", "a", ""]
|
".a.".split "\." use_regex=True . should_equal ["", "a", ""]
|
||||||
"".split "a" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal [""]
|
"".split "a" use_regex=True . should_equal [""]
|
||||||
'aśbs\u{301}c'.split 'ś' (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal ['a', 'b', 'c']
|
'abc'.split '' use_regex=True . should_fail_with Illegal_Argument
|
||||||
'abc'.split '' (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_fail_with Illegal_Argument
|
|
||||||
|
Test.specify "should be able to split the text on Regex patterns, case-insensitively" <|
|
||||||
|
"CAbaBDaBe" . split "ab" use_regex=True case_sensitivity=Case_Sensitivity.Insensitive . should_equal ["C", "", "D", "e"]
|
||||||
|
"caBAbdAbe" . split "(ab)+" use_regex=True case_sensitivity=Case_Sensitivity.Insensitive . should_equal ["c", "d", "e"]
|
||||||
|
"ABc" . split "[a-z]" use_regex=True case_sensitivity=Case_Sensitivity.Insensitive . should_equal ["", "", "", ""]
|
||||||
|
|
||||||
|
Test.specify "regex and non-regex `split` handle accented grapheme splitting differently" <|
|
||||||
|
'aśbs\u{301}c'.split 'ś' use_regex=True . should_equal ['a', 'bs\u{301}c']
|
||||||
|
'aśbs\u{301}c'.split 'ś' . should_equal ['a', 'b', 'c']
|
||||||
|
|
||||||
Test.specify "should be able to split the text on UTF-8 whitespace" <|
|
Test.specify "should be able to split the text on UTF-8 whitespace" <|
|
||||||
utf_8_whitespace.split "\s+" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal utf_8_whitespace_split
|
utf_8_whitespace.split "\s+" use_regex=True . should_equal utf_8_whitespace_split
|
||||||
'abc def\tghi'.split '\\s+' (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Sensitive) . should_equal ["abc", "def", "ghi"]
|
'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]
|
||||||
|
|
||||||
|
Test.specify "exmples should be correct" <|
|
||||||
|
"Namespace::package::package::Type".split "::" . should_equal ["Namespace", "package", "package", "Type"]
|
||||||
|
"abc--def==>ghi".split "[-=>]+" use_regex=True . should_equal ["abc", "def", "ghi"]
|
||||||
|
'abc def\tghi'.split '\\s+' use_regex=True . should_equal ["abc", "def", "ghi"]
|
||||||
|
|
||||||
Test.specify "should convert any type to text automatically and using provided methods" <|
|
Test.specify "should convert any type to text automatically and using provided methods" <|
|
||||||
t = Auto.Value (Manual.Value 123) . to_text
|
t = Auto.Value (Manual.Value 123) . to_text
|
||||||
@ -1363,53 +1375,26 @@ spec =
|
|||||||
|
|
||||||
Test.group "Regex splitting" <|
|
Test.group "Regex splitting" <|
|
||||||
Test.specify "should be possible on text" <|
|
Test.specify "should be possible on text" <|
|
||||||
splits = "abcde".split "[bd]" Regex_Matcher.Value
|
splits = "abcde".split "[bd]" use_regex=True
|
||||||
splits.length . should_equal 3
|
splits.length . should_equal 3
|
||||||
splits.at 0 . should_equal "a"
|
splits.at 0 . should_equal "a"
|
||||||
splits.at 1 . should_equal "c"
|
splits.at 1 . should_equal "c"
|
||||||
splits.at 2 . should_equal "e"
|
splits.at 2 . should_equal "e"
|
||||||
|
|
||||||
Test.specify "should be possible on unicode text" <|
|
Test.specify "should be possible on unicode text" <|
|
||||||
match = "Korean: 건반 (hangul)".split " " Regex_Matcher.Value
|
match = "Korean: 건반 (hangul)".split " " use_regex=True
|
||||||
match.length . should_equal 3
|
match.length . should_equal 3
|
||||||
match.at 0 . should_equal "Korean:"
|
match.at 0 . should_equal "Korean:"
|
||||||
match.at 1 . should_equal "건반"
|
match.at 1 . should_equal "건반"
|
||||||
match.at 2 . should_equal "(hangul)"
|
match.at 2 . should_equal "(hangul)"
|
||||||
|
|
||||||
Test.specify "should be possible in ascii mode" <|
|
|
||||||
splits = "İiİ".split "\w" (Regex_Matcher.Value match_ascii=True)
|
|
||||||
splits.length . should_equal 2
|
|
||||||
splits.at 0 . should_equal "İ"
|
|
||||||
splits.at 1 . should_equal "İ"
|
|
||||||
|
|
||||||
Test.specify "should be possible in case-insensitive mode" <|
|
Test.specify "should be possible in case-insensitive mode" <|
|
||||||
splits = "abaBa".split "b" (Regex_Matcher.Value case_sensitivity=Case_Sensitivity.Insensitive)
|
splits = "abaBa".split "b" use_regex=True case_sensitivity=Case_Sensitivity.Insensitive
|
||||||
splits.length . should_equal 3
|
splits.length . should_equal 3
|
||||||
splits.at 0 . should_equal "a"
|
splits.at 0 . should_equal "a"
|
||||||
splits.at 1 . should_equal "a"
|
splits.at 1 . should_equal "a"
|
||||||
splits.at 2 . should_equal "a"
|
splits.at 2 . should_equal "a"
|
||||||
|
|
||||||
Test.specify "should be possible in dot_matches_newline mode" <|
|
|
||||||
splits = 'ab\nabcd'.split "b." (Regex_Matcher.Value dot_matches_newline=True)
|
|
||||||
splits.length . should_equal 3
|
|
||||||
splits.at 0 . should_equal "a"
|
|
||||||
splits.at 1 . should_equal "a"
|
|
||||||
splits.at 2 . should_equal "d"
|
|
||||||
|
|
||||||
Test.specify "should be possible in multiline mode" <|
|
|
||||||
text = """
|
|
||||||
Foo
|
|
||||||
bar
|
|
||||||
match = text.split "$" (Regex_Matcher.Value multiline=True)
|
|
||||||
match.length . should_equal 3
|
|
||||||
|
|
||||||
Test.specify "should be possible in comments mode" <|
|
|
||||||
splits = "abcde".split "[bd] # Split on the letters `b` and `d`" (Regex_Matcher.Value comments=True)
|
|
||||||
splits.length . should_equal 3
|
|
||||||
splits.at 0 . should_equal "a"
|
|
||||||
splits.at 1 . should_equal "c"
|
|
||||||
splits.at 2 . should_equal "e"
|
|
||||||
|
|
||||||
Test.group "Text.replace" <|
|
Test.group "Text.replace" <|
|
||||||
Test.specify "should work as in examples" <|
|
Test.specify "should work as in examples" <|
|
||||||
'aaa'.replace 'aa' 'b' . should_equal 'ba'
|
'aaa'.replace 'aa' 'b' . should_equal 'ba'
|
||||||
@ -1486,7 +1471,7 @@ spec =
|
|||||||
|
|
||||||
"Korean: 건반".replace "건반" "keyboard" . should_equal "Korean: keyboard"
|
"Korean: 건반".replace "건반" "keyboard" . should_equal "Korean: keyboard"
|
||||||
|
|
||||||
Test.specify "regex and non-regex replace handle accented grapheme splitting differently" <|
|
Test.specify "regex and non-regex `replace` handle accented grapheme splitting differently" <|
|
||||||
'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}'
|
'sśs\u{301}' . replace 's' 'O' . should_equal 'Ośs\u{301}'
|
||||||
'sśs\u{301}' . replace 's' 'O' use_regex=True . should_equal 'OśO\u{301}'
|
'sśs\u{301}' . replace 's' 'O' use_regex=True . should_equal 'OśO\u{301}'
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user