mirror of https://github.com/enso-org/enso.git, synced 2024-11-22 22:10:15 +03:00
parent 310a2d8ae7
commit 6766389cd7
@@ -368,9 +368,10 @@
  `fill_nothing` and `is_nothing`. Added `fill_empty`.][5863]
- [Removed many regex compile flags from `replace`; added `only_first` and
  `use_regex` flag.][5959]
- [Removed many regex compile flags from `split`; added `only_first` and
  `use_regex` flag.][6116]
- [Implemented proper support for Value Types in the Table library.][6073]
- [Added `Text.tokenize`.][6150]
- [Added support for Date/Time columns in the Postgres backend and added
  `year`/`month`/`day` operations to Table columns.][6153]
@@ -561,8 +562,9 @@
[5917]: https://github.com/enso-org/enso/pull/5917
[5705]: https://github.com/enso-org/enso/pull/5705
[5959]: https://github.com/enso-org/enso/pull/5959
[6116]: https://github.com/enso-org/enso/pull/6116
[6073]: https://github.com/enso-org/enso/pull/6073
[6150]: https://github.com/enso-org/enso/pull/6150
[6153]: https://github.com/enso-org/enso/pull/6153

#### Enso Compiler
@@ -1,6 +1,8 @@
import project.Data.Locale.Locale
import project.Data.Text.Regex
import project.Data.Text.Text
import project.Error.Error
import project.Errors.Illegal_Argument.Illegal_Argument

from project.Data.Boolean import Boolean, True, False
@@ -31,12 +33,18 @@ type Case_Sensitivity
            TextFoldingStrategy.caseInsensitiveFold locale.java_locale

    ## PRIVATE
       Is case insensitive when in memory.
       Is case insensitive when in memory. If case-insensitive, the locale
       must be the default locale; otherwise this throws Illegal_Argument.
    is_case_insensitive_in_memory : Boolean
    is_case_insensitive_in_memory self = case self of
        Case_Sensitivity.Default -> False
        Case_Sensitivity.Sensitive -> False
        Case_Sensitivity.Insensitive _ -> True
        Case_Sensitivity.Insensitive locale -> case locale == Locale.default of
            True -> True
            False ->
                msg = "Custom locales are not supported for this operation."
                Error.throw (Illegal_Argument.Error msg)

    ## PRIVATE
       Create matcher function
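For orientation, this is how the new check is consumed at the regex call sites updated later in this diff. A minimal sketch, not part of the commit: the `compile_with_sensitivity` name and the pattern literal are illustrative only.

    # Default/Sensitive yield case_insensitive=False; Insensitive with
    # Locale.default yields True; any other locale raises Illegal_Argument.
    compile_with_sensitivity case_sensitivity =
        case_insensitive = case_sensitivity.is_case_insensitive_in_memory
        Regex_2.compile "a[ab]c" case_insensitive=case_insensitive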
@@ -10,7 +10,6 @@ import project.Data.Range.Range
import project.Data.Text.Case.Case
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Data.Text.Encoding.Encoding
import project.Data.Text.Helpers
import project.Data.Text.Location.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex.Match.Match
@@ -232,10 +231,9 @@ Text.characters self =
         "aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error | Illegal_Argument
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    Helpers.regex_assume_default_locale case_sensitivity <|
        case_insensitive = case_sensitivity.is_case_insensitive_in_memory
        compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
        compiled_pattern.match self
    case_insensitive = case_sensitivity.is_case_insensitive_in_memory
    compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
    compiled_pattern.match self

## Finds all the matches of the regular expression `pattern` in `self`,
   returning a Vector. If there are no matches, the Vector is empty.
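The same mechanical change repeats below for `find_all`, `match`, `split`, and `replace`: the `Helpers.regex_assume_default_locale` wrapper is dropped and each call site keeps the bare compile-and-match shape. A quick sketch of the resulting behavior, reusing the strings from the doc example above:

    # Case-insensitive find; locale validation now happens inside
    # is_case_insensitive_in_memory rather than in the deleted wrapper.
    "aabbbbccccaaBcaaaa" . find "a[ab]c" Case_Sensitivity.Insensitive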
@@ -260,10 +258,9 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
         "aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error | Illegal_Argument
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    Helpers.regex_assume_default_locale case_sensitivity <|
        case_insensitive = case_sensitivity.is_case_insensitive_in_memory
        compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
        compiled_pattern.match_all self
    case_insensitive = case_sensitivity.is_case_insensitive_in_memory
    compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
    compiled_pattern.match_all self

## ALIAS Check Matches
@@ -291,10 +288,9 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
         "CONTACT@enso.org".match regex Case_Sensitivity.Insensitive
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error | Illegal_Argument
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    Helpers.regex_assume_default_locale case_sensitivity <|
        case_insensitive = case_sensitivity.is_case_insensitive_in_memory
        compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
        compiled_pattern.matches self
    case_insensitive = case_sensitivity.is_case_insensitive_in_memory
    compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
    compiled_pattern.matches self

## ALIAS Split Text
@@ -339,10 +335,40 @@ Text.split self delimiter="," case_sensitivity=Case_Sensitivity.Sensitive only_f
                delimiters.at i . codeunit_start
            Text_Utils.substring self start end
        True ->
            Helpers.regex_assume_default_locale case_sensitivity <|
                case_insensitive = case_sensitivity.is_case_insensitive_in_memory
                compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
                compiled_pattern.split self only_first
            case_insensitive = case_sensitivity.is_case_insensitive_in_memory
            compiled_pattern = Regex_2.compile delimiter case_insensitive=case_insensitive
            compiled_pattern.split self only_first

## ADVANCED
   Takes an input string and a pattern and returns all the matches as a
   `Vector Text`. If the pattern contains marked groups, the values are
   concatenated together; otherwise the whole match is returned.

   Arguments:
   - pattern: The regular expression to match on.
   - case_sensitivity: Specifies whether the pattern is matched case
     sensitively. Matching is case sensitive by default.

   > Example
     Split to blocks of 3 characters.

         "ABCDEF" . tokenize "..." == ["ABC","DEF"]

   > Example
     Split to blocks of 3 characters, taking the first and third letters.

         "ABCDEF" . tokenize "(.).(.)" == ["AC","DF"]

   > Example
     Split a text on any white space.

         'Hello Big\r\nWide\tWorld\nGoodbye!' . tokenize "(\S+)(?:\s+|$)"
             == ["Hello","Big","Wide","World","Goodbye!"]
Text.tokenize : Text -> Case_Sensitivity -> Vector Text
Text.tokenize self pattern="." case_sensitivity=Case_Sensitivity.Sensitive =
    case_insensitive = case_sensitivity.is_case_insensitive_in_memory
    compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
    compiled_pattern.tokenize self

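Since `Text.tokenize` only compiles the pattern and defers to the compiled pattern's `tokenize`, the two spellings below should agree; a sketch built from the doc examples above:

    # These should produce the same result, per the implementation above.
    "ABCDEF" . tokenize "(.).(.)" . should_equal ["AC", "DF"]
    Regex_2.compile "(.).(.)" . tokenize "ABCDEF" . should_equal ["AC", "DF"]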
## ALIAS Replace Text
   Perform a text or regex replace.
@@ -434,10 +460,9 @@ Text.replace self term replacement case_sensitivity=Case_Sensitivity.Sensitive o
                Text_Utils.span_of_case_insensitive self term locale.java_locale False
            Text_Utils.replace_spans self spans_array replacement
        True ->
            Helpers.regex_assume_default_locale case_sensitivity <|
                case_insensitive = case_sensitivity.is_case_insensitive_in_memory
                compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
                compiled_pattern.replace self replacement only_first
            case_insensitive = case_sensitivity.is_case_insensitive_in_memory
            compiled_pattern = Regex_2.compile term case_insensitive=case_insensitive
            compiled_pattern.replace self replacement only_first

## ALIAS Get Words
@@ -1,16 +0,0 @@
from Standard.Base import all

import project.Any.Any
import project.Data.Locale.Locale
import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Errors.Illegal_Argument.Illegal_Argument

## PRIVATE
regex_assume_default_locale : Case_Sensitivity -> Any -> Any ! Illegal_Argument
regex_assume_default_locale case_sensitivity ~action = case case_sensitivity of
    Case_Sensitivity.Sensitive -> action
    Case_Sensitivity.Insensitive locale -> case locale == Locale.default of
        True -> action
        False ->
            msg = "Custom locales are not supported for regexes."
            Error.throw (Illegal_Argument.Error msg)
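The guard this deleted file provided now lives inside `Case_Sensitivity.is_case_insensitive_in_memory` (see the hunk above). A sketch of the error path, assuming a hypothetical custom locale built with `Locale.new`:

    # Hypothetical: a non-default locale now fails in the check itself.
    Case_Sensitivity.Insensitive (Locale.new "pl") . is_case_insensitive_in_memory
    # => Illegal_Argument: "Custom locales are not supported for this operation."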
@@ -1,4 +1,5 @@
import project.Any.Any
import project.Data.Filter_Condition.Filter_Condition
import project.Data.Map.Map
import project.Data.Numbers.Integer
import project.Data.Range.Extensions
@@ -143,7 +144,8 @@ type Pattern_2

       Takes an input string and returns all the matches as a `Vector Text`.
       If the pattern contains marked groups, the values are concatenated
       together; otherwise the whole match is returned.
       together; otherwise the whole match is returned. Non-participating
       groups are omitted.

       Arguments:
       - input: The text to tokenize.
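A concrete instance of the omission rule, lifted from the tests later in this diff: when the digit alternative matches, the `([a-z])` group does not participate and contributes nothing to the concatenation.

    Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_4_0" . should_equal ["340"]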
@@ -411,7 +413,7 @@ build_tokenization_output_from_match pattern match =
    if pattern.group_count == 1 then match.text 0 else
        # Extract the ranges of the spans of all capturing groups
        group_numbers = 1.up_to pattern.group_count
        spans = group_numbers.map n-> match.span n
        spans = group_numbers.map (n-> match.span n) . filter Filter_Condition.Not_Nothing
        ranges = spans.map span-> case span of Span.Value range _ -> range

        # Eliminate nested capturing groups by sorting and merging the ranges.
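The merge step is what collapses nested capturing groups into their outermost group; the expectation below, taken from the test suite further down, shows the inner `(\d\d)` being absorbed by the enclosing group.

    Regex_2.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"]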
@@ -8,7 +8,6 @@ import Standard.Base.Data.Text.Regex_2
import Standard.Base.Data.Text.Regex_2.No_Such_Group
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.IO

from Standard.Base.Data.Text.Regex.Replacer import get_lru_size, replacer_cache_lookup
@@ -18,13 +17,6 @@ import Standard.Test.Extensions
polyglot java import org.enso.base.Replacer_Cache

spec =
    Test.group "gmt" <|
        Test.specify "asdf" <|
            IO.println <| Regex_2.compile 's\u{301}' . replace 'sśs\u{301}' '-'
            IO.println <| Regex_2.compile 'a\u{301}' . match_all "aááêe xêy"
            #Regex_2.compile 'a\u{301}+' . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
            Regex_2.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']

    Test.group "Compile" <|
        Test.specify "should be able to be compiled" <|
            pattern = Regex_2.compile "(?<dots>..)" case_insensitive=True
@@ -177,12 +169,12 @@ spec =
            pattern.split 'aśsśs\u{301}śb' . should_equal ['aś', 'ś', '\u{301}śb']

    Test.group "Pattern_2.tokenize" <|
        Test.specify "can tokenize simple regexes without capturing groups" <|
        Test.specify "can tokenize with simple regexes without capturing groups" <|
            Regex_2.compile "[a-z]+" . tokenize "1-800-regex-yes" . should_equal ["regex", "yes"]
            Regex_2.compile "[a-z]+" case_insensitive=True . tokenize "1-800-REGEX-YES" . should_equal ["REGEX", "YES"]
            Regex_2.compile "\d\d" . tokenize "12 hi345 67r890r" . should_equal ["12", "34", "67", "89"]

        Test.specify "can tokenize regexes with capturing groups" <|
        Test.specify "can tokenize with regexes with capturing groups" <|
            Regex_2.compile "(\d\d)\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
            Regex_2.compile "[a-z]+(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
            Regex_2.compile "[a-z]+(\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["", "182", "20", ""]
@@ -190,25 +182,24 @@ spec =
        Test.specify "ignores non-capturing groups" <|
            Regex_2.compile "(?:(\d\d)\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
            Regex_2.compile "(\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
            Regex_2.compile "(?<foo>\d\d)(?:\d)" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
            Regex_2.compile "(?:[a-z]+)(\d+)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]

        Test.specify "ignores nested groups" <|
            Regex_2.compile "(\d(\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
            Regex_2.compile "(?<foo>\d(?<bar>\d))\d" . tokenize "12 hi345 67r890r" . should_equal ["34", "89"]
            Regex_2.compile "[a-z]+((\d)\d*)" . tokenize "xy blink182 !!matchbox20 foo" . should_equal ["182", "20"]
            Regex_2.compile "\d(\d(\d\d)\d)\d" . tokenize "012345678901" . should_equal ["1234", "7890"]

        Test.specify "non-participating groups are rendered as the empty string" <|
            Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_4_0" . should_equal ['340']
            Regex_2.compile "(\d).(?:(\d)|([a-z])).(\d)" . tokenize "3_q_0" . should_equal ['3q0']

        Test.specify "handles unicode" <|
            Regex_2.compile "[áê]+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
            # Fails:
            #Regex_2.compile '[a\u{301}e\u{301}]+' . tokenize 'aááêe xêy' . should_equal ['a\u{301}a\u{301}e\u{301}', 'e\u{301}']
            #Regex_2.compile '(?:a\u{301})+' . tokenize 'aááêe xêy' . should_equal ['a\u{301}a\u{301}']
            #Regex_2.compile 'a\u{301}' . tokenize 'aááêe xêy' . should_equal ['a\u{301}', 'a\u{301}']
            # Wrong: `+` only applies to the accent `\u{301}`, not to the
            # entire grapheme.
            Regex_2.compile 'a\u{301}+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
            # Fails:
            Regex_2.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}', 'a\u{301}']
            #Regex_2.compile "a\u{301}+" . tokenize "aááêe xêy" . should_equal ["ááê", "ê"]
            Regex_2.compile '(?:a\u{301})+' . tokenize 'aa\u{301}a\u{301}êe xêy' . should_equal ['a\u{301}a\u{301}']
            Regex_2.compile "x([áê]+)y" . tokenize "xáy xêy" . should_equal ["á", "ê"]

        Test.specify "examples are correct" <|
@@ -1395,6 +1395,17 @@ spec =
            splits.at 1 . should_equal "a"
            splits.at 2 . should_equal "a"

    Test.group "Regex tokenizing" <|
        Test.specify "can tokenize with simple regexes without capturing groups" <|
            "1-800-regex-yes" . tokenize "[a-z]+" . should_equal ["regex", "yes"]
            "1-800-REGEX-YES" . tokenize "[a-z]+" case_sensitivity=Case_Sensitivity.Insensitive . should_equal ["REGEX", "YES"]
            "12 hi345 67r890r" . tokenize "\d\d" . should_equal ["12", "34", "67", "89"]

        Test.specify "examples are correct" <|
            "ABCDEF" . tokenize "..." . should_equal ["ABC","DEF"]
            "ABCDEF" . tokenize "(.).(.)" . should_equal ["AC","DF"]
            'Hello Big\r\nWide\tWorld\nGoodbye!' . tokenize "(\S+)(?:\s+|$)" . should_equal ["Hello","Big","Wide","World","Goodbye!"]

    Test.group "Text.replace" <|
        Test.specify "should work as in examples" <|
            'aaa'.replace 'aa' 'b' . should_equal 'ba'