mirror of
https://github.com/enso-org/enso.git
synced 2024-11-22 22:10:15 +03:00
Regex.Compile using Truffle Regex, update find, match and match_all (#5785)
This commit is contained in:
parent
91ef8acf35
commit
7887fb8d40
@ -333,6 +333,8 @@
|
||||
and renamed them to `match`, `find`, `find_all` (respectively).][5721]
|
||||
- [Updated `rename_columns` to new API. Added `first_row`, `second_row` and
|
||||
`last_row` to Table types][5719]
|
||||
- [Remove many regex compile flags; separated `match` into `match` and
|
||||
`match_all`.][5785]
|
||||
- [Aligned names of columns created by column operations.][5850]
|
||||
|
||||
[debug-shortcuts]:
|
||||
@ -509,10 +511,11 @@
|
||||
[5699]: https://github.com/enso-org/enso/pull/5699
|
||||
[5719]: https://github.com/enso-org/enso/pull/5719
|
||||
[5721]: https://github.com/enso-org/enso/pull/5721
|
||||
[5779]: https://github.com/enso-org/enso/pull/5779
|
||||
[5757]: https://github.com/enso-org/enso/pull/5757
|
||||
[5802]: https://github.com/enso-org/enso/pull/5802
|
||||
[5774]: https://github.com/enso-org/enso/pull/5774
|
||||
[5779]: https://github.com/enso-org/enso/pull/5779
|
||||
[5785]: https://github.com/enso-org/enso/pull/5785
|
||||
[5802]: https://github.com/enso-org/enso/pull/5802
|
||||
[5850]: https://github.com/enso-org/enso/pull/5850
|
||||
|
||||
#### Enso Compiler
|
||||
|
@ -12,10 +12,11 @@ import project.Data.Text.Case_Sensitivity.Case_Sensitivity
|
||||
import project.Data.Text.Encoding.Encoding
|
||||
import project.Data.Text.Location
|
||||
import project.Data.Text.Matching_Mode.Matching_Mode
|
||||
import project.Data.Text.Regex
|
||||
import project.Data.Text.Regex.Match.Match
|
||||
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
|
||||
import project.Data.Text.Regex_Matcher.Regex_Matcher
|
||||
import project.Data.Text.Regex_2
|
||||
import project.Data.Text.Regex_2.Regex_Syntax_Error
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
@ -23,7 +24,6 @@ import project.Data.Text.Text_Matcher.Text_Matcher
|
||||
import project.Data.Text.Text_Sub_Range.Codepoint_Ranges
|
||||
import project.Data.Text.Text_Sub_Range.Text_Sub_Range
|
||||
import project.Data.Vector.Vector
|
||||
import project.Error.Common.Compile_Error
|
||||
import project.Error.Common.Index_Out_Of_Bounds
|
||||
import project.Error.Error
|
||||
import project.Error.Encoding_Error.Encoding_Error
|
||||
@ -227,10 +227,10 @@ Text.characters self =
|
||||
example_find_insensitive =
|
||||
## This matches `aBc` @ character 11
|
||||
"aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive
|
||||
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Compile_Error
|
||||
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error
|
||||
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
Regex.compile pattern case_insensitive=case_insensitive . match self Matching_Mode.First
|
||||
Regex_2.compile pattern case_insensitive=case_insensitive . match self
|
||||
|
||||
## Finds all the matches of the regular expression `pattern` in `self`,
|
||||
returning a Vector. If not found, will be an empty Vector.
|
||||
@ -249,12 +249,10 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
example_find_all_insensitive =
|
||||
## This matches `aABbbbc` @ character 0 and `aBC` @ character 11
|
||||
"aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive
|
||||
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Compile_Error
|
||||
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error
|
||||
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
case Regex.compile pattern case_insensitive=case_insensitive . match self Regex_Mode.All of
|
||||
Nothing -> []
|
||||
matches -> matches
|
||||
Regex_2.compile pattern case_insensitive=case_insensitive . match_all self
|
||||
|
||||
## ALIAS Check Matches
|
||||
|
||||
@ -276,10 +274,10 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
regex = ".+ct@.+"
|
||||
# Evaluates to true
|
||||
"CONTACT@enso.org".match regex Case_Sensitivity.Insensitive
|
||||
Text.match : Text -> Case_Sensitivity -> Boolean ! Compile_Error
|
||||
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error
|
||||
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
|
||||
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
|
||||
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
|
||||
compiled_pattern.matches self
|
||||
|
||||
## ALIAS Split Text
|
||||
|
@ -1,5 +1,6 @@
|
||||
## Internal text utilities for inspecting text primitives.
|
||||
|
||||
import project.Any.Any
|
||||
import project.Data.Text.Text
|
||||
|
||||
## PRIVATE
|
||||
@ -7,3 +8,17 @@ import project.Data.Text.Text
|
||||
Forces flattening of a text value.
|
||||
optimize : Text
|
||||
optimize text = @Builtin_Method "Prim_Text_Helper.optimize"
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Compile the regex using the Truffle regex library.
|
||||
|
||||
Returns a Java RegexObject (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
|
||||
Arguments:
|
||||
- pattern: the regex to compile
|
||||
- options: string containing traditional regex flags (for example, "g"
|
||||
as in "/foo/g"
|
||||
compile_regex : Text -> Text -> Any
|
||||
compile_regex pattern options = @Builtin_Method "Prim_Text_Helper.compile_regex"
|
||||
|
@ -0,0 +1,67 @@
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Text
|
||||
import project.Nothing.Nothing
|
||||
|
||||
type Match_2
|
||||
## internal_regex_result : RegexResult (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (pattern : Pattern_2) (internal_regex_result : Any) (input : Text)
|
||||
|
||||
## PRIVATE
|
||||
Returns the start character of group n.
|
||||
|
||||
Arguments:
|
||||
- n: the integer group number. Note that the groups explicitly
|
||||
defined in the regex are numbered starting at 1; group 0 refers to the
|
||||
entire match range.
|
||||
start : Integer -> Integer
|
||||
start self n = self.internal_regex_result.getStart n
|
||||
|
||||
## PRIVATE
|
||||
Returns the start character of group n
|
||||
|
||||
Arguments:
|
||||
- n: the integer group number
|
||||
end : Integer -> Integer
|
||||
end self n = self.internal_regex_result.getEnd n
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Gets the text matched by the group with the provided identifier, or
|
||||
`Nothing` if the group did not participate in the match. If no such group
|
||||
exists for the provided identifier, a `No_Such_Group` is returned.
|
||||
|
||||
Arguments:
|
||||
- id: The integer index or name of that group.
|
||||
|
||||
? The Full Match
|
||||
The group with index 0 is always the full match of the pattern.
|
||||
|
||||
? Named Groups by Index
|
||||
If the regex contained named groups, these may also be accessed by
|
||||
index based on their position in the pattern.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match_2.group will return Nothing.
|
||||
group : Integer | Text -> Span
|
||||
group self id =
|
||||
n = self.pattern.lookup_group id
|
||||
start = self.start n
|
||||
end = self.end n
|
||||
does_not_participate = start == -1 || end == -1
|
||||
case does_not_participate of
|
||||
True -> Nothing
|
||||
False ->
|
||||
range = Range.new (self.start n) (self.end n)
|
||||
(Utf_16_Span.Value range self.input).to_grapheme_span
|
@ -0,0 +1,177 @@
|
||||
import project.Any.Any
|
||||
#import project.Data.Boolean.Boolean
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
import project.Data.Map.Map
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Range.Extensions
|
||||
import project.Data.Range.Range
|
||||
import project.Data.Text.Span.Span
|
||||
import project.Data.Text.Span.Utf_16_Span
|
||||
import project.Data.Text.Regex.Match_2.Match_2
|
||||
import project.Data.Text.Regex_2.No_Such_Group
|
||||
import project.Data.Text.Text
|
||||
import project.Data.Vector.Vector
|
||||
import project.Error.Error
|
||||
import project.Meta
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
import project.Polyglot
|
||||
|
||||
polyglot java import org.enso.base.Text_Utils
|
||||
|
||||
type Pattern_2
|
||||
## internal_regex_object : RegexObject (Truffle)
|
||||
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
|
||||
Value (internal_regex_object : Any)
|
||||
|
||||
## Returns `True` if the input matches against the pattern described by
|
||||
`self`, otherwise `False`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to check for matching.
|
||||
matches : Text -> Boolean
|
||||
matches self input =
|
||||
m = self.internal_regex_object.exec input 0
|
||||
m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match : Text -> Match_2 | Nothing
|
||||
match self input =
|
||||
it = Match_Iterator.new self input
|
||||
case it.next of
|
||||
Match_Iterator_Value.Next _ match _ -> match
|
||||
Match_Iterator_Value.Last _ -> Nothing
|
||||
|
||||
## Tries to match the provided `input` against the pattern `self`.
|
||||
|
||||
Returns a Vector of Match_2 objects.
|
||||
|
||||
Arguments:
|
||||
- input: The text to match the pattern described by `self` against.
|
||||
match_all : Text -> Vector Match_2
|
||||
match_all self input =
|
||||
builder = Vector.new_builder
|
||||
it = Match_Iterator.new self input
|
||||
go it = case it.next of
|
||||
Match_Iterator_Value.Next _ match next_it ->
|
||||
builder.append match
|
||||
go next_it
|
||||
Match_Iterator_Value.Last _ -> Nothing
|
||||
go it
|
||||
builder.to_vector
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Look up a match group name or number, and check that it is valid.
|
||||
|
||||
Arguments:
|
||||
- id: The name or number of the group that was asked for.
|
||||
|
||||
Returns: a group number.
|
||||
|
||||
A group number is invalid if it is outside the range of groups
|
||||
that were in the original pattern.
|
||||
|
||||
A group name is invalid if it was not defined in the original pattern.
|
||||
|
||||
A group name is an alias for a group number; if a name is passed to
|
||||
this method, it returns the corresponding group number.
|
||||
|
||||
Note that it is possible for a group to "not participate in the match",
|
||||
for example with a disjunction. In the example below, the "(d)" group
|
||||
does not participate -- it neither matches nor fails.
|
||||
|
||||
"ab((c)|(d))".find "abc"
|
||||
|
||||
In this case, the group id for "(d)", which is 3, is a valid group id and
|
||||
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
|
||||
Match_2.group will return Nothing.
|
||||
|
||||
lookup_group : Integer | Text -> Integer ! No_Such_Group
|
||||
lookup_group self id =
|
||||
case id of
|
||||
n : Integer -> case (n >= 0 && n < self.internal_regex_object.groupCount) of
|
||||
True -> n
|
||||
False -> Error.throw (No_Such_Group.Error n)
|
||||
name : Text ->
|
||||
# Maps name to number
|
||||
groups = self.internal_regex_object.groups
|
||||
|
||||
n = case groups of
|
||||
# If Nothing, there are no named groups
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
_ ->
|
||||
qq = (read_group_map groups name)
|
||||
case qq of
|
||||
Nothing -> Nothing
|
||||
n : Integer -> n
|
||||
case n of
|
||||
_ : Integer -> n
|
||||
Nothing -> Error.throw (No_Such_Group.Error name)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Performs the regex match, and iterates through the results. Yields both
|
||||
the matched parts of the string, and the 'filler' parts between them.
|
||||
|
||||
At each step, it yields a Match_Iterator_Value, whivch has either a filler
|
||||
and a match, or just the final filler. A Match_Iterator_Value.Last value is
|
||||
return at the end, and only at the end.
|
||||
type Match_Iterator
|
||||
new : Pattern_2 -> Text -> Match_Iterator
|
||||
new pattern input = Match_Iterator.Value pattern input 0
|
||||
|
||||
Value (pattern : Pattern_2) (input : Text) (cursor : Integer)
|
||||
|
||||
next : Match_Iterator_Value
|
||||
next self =
|
||||
regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
|
||||
case regex_result.isMatch of
|
||||
False ->
|
||||
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
|
||||
Match_Iterator_Value.Last filler_span
|
||||
True ->
|
||||
match_start = regex_result.getStart 0
|
||||
filler_range = Range.new self.cursor match_start
|
||||
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
|
||||
match = Match_2.Value self.pattern regex_result self.input
|
||||
next_cursor = match.end 0
|
||||
next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
|
||||
Match_Iterator_Value.Next filler_span match next_iterator
|
||||
|
||||
to_text_debug : Vector Text
|
||||
to_text_debug self =
|
||||
vb = Vector.new_builder
|
||||
go it = case it.next of
|
||||
Match_Iterator_Value.Next filler match next_it ->
|
||||
vb.append ('\"' + filler.text + '\"')
|
||||
vb.append ("/" + (match.span 0).text + "/")
|
||||
go next_it
|
||||
Match_Iterator_Value.Last filler ->
|
||||
vb.append ('\"' + filler.text + '\"')
|
||||
go self
|
||||
vb.to_vector
|
||||
|
||||
## PRIVATE
|
||||
type Match_Iterator_Value
|
||||
Next (filler : Span) (match : Match_2) (next_iterator : Match_Iterator)
|
||||
Last (filler : Span)
|
||||
|
||||
## PRIVATE
|
||||
group_map_contains : Any -> Text -> Boolean
|
||||
group_map_contains map elem =
|
||||
members = Polyglot.get_members map
|
||||
as_vec = Vector.from_polyglot_array members
|
||||
as_vec.contains elem
|
||||
|
||||
## PRIVATE
|
||||
read_group_map : Any -> Text -> Integer | Nothing
|
||||
read_group_map map name =
|
||||
case group_map_contains map name of
|
||||
True -> Polyglot.get_member map name
|
||||
False -> Nothing
|
||||
|
@ -0,0 +1,82 @@
|
||||
import project.Any.Any
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Text.Prim_Text_Helper
|
||||
import project.Data.Text.Regex.Pattern_2.Pattern_2
|
||||
import project.Data.Text.Text
|
||||
import project.Error.Error
|
||||
import project.Error.Illegal_Argument.Illegal_Argument
|
||||
import project.Nothing.Nothing
|
||||
import project.Panic.Panic
|
||||
from project.Data.Boolean import Boolean, True, False
|
||||
from project.Error.Common import Syntax_Error
|
||||
|
||||
polyglot java import java.util.regex.Pattern as Java_Pattern
|
||||
|
||||
## Compile the provided `expression` into a regex pattern that can be used for
|
||||
matching.
|
||||
|
||||
Arguments
|
||||
- expression: The text representing the regular expression that you want to
|
||||
compile.
|
||||
- case_insensitive: Enables or disables case-insensitive matching. Case
|
||||
insensitive matching behaves as if it normalises the case of all input
|
||||
text before matching on it.
|
||||
|
||||
? Why Compile?
|
||||
While many regex engines are able to cache ad-hoc patterns, it is often
|
||||
useful to be able to manually retain a pattern that you have computed. This
|
||||
function exists so you can hold onto the resultant `Pattern_2` object,
|
||||
instead of immediately proceeding to match using it.
|
||||
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error
|
||||
compile self expression case_insensitive=Nothing =
|
||||
options_string = if case_insensitive == True then "usgi" else "usg"
|
||||
|
||||
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
|
||||
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
|
||||
|
||||
Pattern_2.Value internal_regex_object
|
||||
|
||||
## ADVANCED
|
||||
|
||||
Escape the special characters in `expression` such that the result is a
|
||||
valid literal pattern for the original string.
|
||||
|
||||
Arguments:
|
||||
- expression: The expression to escape metacharacters in.
|
||||
|
||||
> Example
|
||||
Turn a Text into a regex that matches that string exactly.
|
||||
|
||||
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
|
||||
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
|
||||
|
||||
example_escape =
|
||||
literal_string = "\!\.|abcde"
|
||||
engine = Default_Engine.new
|
||||
engine.escape literal_string
|
||||
escape : Text -> Text
|
||||
escape self expression = Java_Pattern.quote expression
|
||||
|
||||
## An error that is emitted when there is no such group in the match for the
|
||||
provided `id`.
|
||||
|
||||
Arguments:
|
||||
- id: The identifier of the group that was asked for but does not exist.
|
||||
type No_Such_Group
|
||||
Error (id : Text | Integer)
|
||||
|
||||
## PRIVATE
|
||||
|
||||
Provides a human-readable representation of the `No_Such_Group`.
|
||||
to_display_text : Text
|
||||
to_display_text self = case self.id of
|
||||
_ : Integer -> "No group exists with the index " + self.id.to_text + "."
|
||||
_ : Text -> "No group exists with the name " + self.id + "."
|
||||
|
||||
## A syntax error reported by the Truffle regex compiler.
|
||||
type Regex_Syntax_Error
|
||||
## PRIVATE
|
||||
|
||||
Arguments:
|
||||
- message: A description of the erroneous syntax.
|
||||
Error message
|
@ -7,7 +7,7 @@
|
||||
|
||||
example_span =
|
||||
text = "Hello!"
|
||||
Span.Value 0 3 text
|
||||
Span.Value (Range.new 0 3) text
|
||||
|
||||
import project.Data.Numbers.Integer
|
||||
import project.Data.Pair.Pair
|
||||
|
@ -0,0 +1,70 @@
|
||||
package org.enso.interpreter.node.expression.builtin.text;
|
||||
|
||||
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
|
||||
import com.oracle.truffle.api.dsl.Cached;
|
||||
import com.oracle.truffle.api.dsl.Specialization;
|
||||
import com.oracle.truffle.api.exception.AbstractTruffleException;
|
||||
import com.oracle.truffle.api.nodes.Node;
|
||||
import com.oracle.truffle.api.source.Source;
|
||||
|
||||
import org.enso.interpreter.dsl.BuiltinMethod;
|
||||
import org.enso.interpreter.runtime.builtin.Builtins;
|
||||
import org.enso.interpreter.runtime.callable.atom.Atom;
|
||||
import org.enso.interpreter.runtime.data.text.Text;
|
||||
import org.enso.interpreter.runtime.EnsoContext;
|
||||
import org.enso.interpreter.runtime.error.PanicException;
|
||||
|
||||
@BuiltinMethod(
|
||||
type = "Prim_Text_Helper",
|
||||
name = "compile_regex",
|
||||
description = "Compiles a regexp.",
|
||||
autoRegister = false)
|
||||
public abstract class RegexCompileNode extends Node {
|
||||
static RegexCompileNode build() {
|
||||
return RegexCompileNodeGen.create();
|
||||
}
|
||||
|
||||
abstract Object execute(Object pattern, Object options);
|
||||
|
||||
@Specialization(
|
||||
limit = "3",
|
||||
guards = {
|
||||
"pattern.toString().equals(cachedPattern)",
|
||||
"options.toString().equals(cachedOptions)"
|
||||
})
|
||||
Object parseRegexPattern(
|
||||
Text pattern,
|
||||
Text options,
|
||||
@Cached("pattern.toString()") String cachedPattern,
|
||||
@Cached("options.toString()") String cachedOptions,
|
||||
@Cached("compile(cachedPattern, cachedOptions)") Object regex) {
|
||||
return regex;
|
||||
}
|
||||
|
||||
@Specialization
|
||||
Object alwaysCompile(Text pattern, Text options) {
|
||||
return compile(pattern.toString(), options.toString());
|
||||
}
|
||||
|
||||
@TruffleBoundary
|
||||
Object compile(String pattern, String options) {
|
||||
var ctx = EnsoContext.get(this);
|
||||
var env = ctx.getEnvironment();
|
||||
var s = "Flavor=ECMAScript/" + pattern + "/" + options;
|
||||
var src =
|
||||
Source.newBuilder("regex", s, "myRegex")
|
||||
.mimeType("application/tregex")
|
||||
.internal(true)
|
||||
.build();
|
||||
|
||||
try {
|
||||
var regex = env.parseInternal(src).call();
|
||||
return regex;
|
||||
} catch (AbstractTruffleException e) {
|
||||
Builtins builtins = ctx.getBuiltins();
|
||||
String msg = "Regex parse error: " + e.getMessage();
|
||||
Atom err = builtins.error().makeSyntaxError(msg);
|
||||
throw new PanicException(err, this);
|
||||
}
|
||||
}
|
||||
}
|
@ -166,6 +166,8 @@ public final class ParseStdLibTest extends TestCase {
|
||||
"Data/Range.enso",
|
||||
"Data/Sort_Column_Selector.enso",
|
||||
"Data/Text/Extensions.enso",
|
||||
"Data/Text/Regex/Match_2.enso",
|
||||
"Data/Text/Regex/Pattern_2.enso",
|
||||
"Data/Text/Regex/Regex_Mode.enso",
|
||||
"Data/Value_Type.enso",
|
||||
"Data/Vector.enso",
|
||||
|
@ -1,4 +1,6 @@
|
||||
from Standard.Base import all
|
||||
import Standard.Base.Data.Text.Regex_2.No_Such_Group
|
||||
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
|
||||
import Standard.Base.Data.Text.Span.Span
|
||||
import Standard.Base.Error.Common.Index_Out_Of_Bounds
|
||||
import Standard.Base.Error.Common.Type_Error
|
||||
@ -57,13 +59,17 @@ type Manual
|
||||
- Note that currently the regex-based operations may not handle the edge
|
||||
cases described above too well.
|
||||
spec =
|
||||
check_span result span = result.span 0 . to_grapheme_span . should_equal span
|
||||
check_span_all result spans = result . map (m-> (m.span 0).to_grapheme_span) . should_equal spans
|
||||
check_span match span =
|
||||
match . should_not_equal Nothing
|
||||
match.group 0 . should_equal span
|
||||
check_span_all match spans = match . map (m-> (m.group 0)) . should_equal spans
|
||||
|
||||
accent_1 = '\u00E9'
|
||||
accent_2 = '\u0065\u{301}'
|
||||
|
||||
Test.group "Text" <|
|
||||
kshi = '\u0915\u094D\u0937\u093F'
|
||||
facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
|
||||
accent_1 = '\u00E9'
|
||||
accent_2 = '\u0065\u{301}'
|
||||
utf_8_whitespace = 'foo\n bar baz \u202F quux'
|
||||
utf_8_whitespace_split = ["foo", "bar", "baz", "quux"]
|
||||
sentences = '''
|
||||
@ -1186,7 +1192,25 @@ spec =
|
||||
## Regex matching does not do case folding
|
||||
"Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
|
||||
|
||||
## But it should handle the Unicode normalization
|
||||
Test.specify "should handle accents and other multi-point graphemes" <|
|
||||
accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
|
||||
|
||||
# Check spans
|
||||
check_span ((accents).find 'h') (Span.Value (3.up_to 4) accents)
|
||||
check_span ((accents).find 'e\u{301}') (Span.Value (1.up_to 2) accents)
|
||||
check_span_all ((accents).find_all 'h') [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
|
||||
check_span_all ((accents).find_all 'e\u{301}') [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
|
||||
|
||||
# Check contents to make sure the spans' ranges are ok
|
||||
accents.find 'h' . group 0 . text . should_equal 'h'
|
||||
accents.find 'e\u{301}' . group 0 . text . should_equal 'e\u{301}'
|
||||
|
||||
Test.specify "should handle the Unicode normalization" pending="TODO: Rewrite this to use normalization methods explicitly" <|
|
||||
## This test passed for the builtin Java regex library, using
|
||||
Pattern.CANON_EQ, but since that option is buggy and rarely use,
|
||||
we won't attempt to recreate it with Truffle regex. Instead,
|
||||
expose normalization methods to allow developers to do it
|
||||
themselves.
|
||||
accents = 'a\u{301}e\u{301}o\u{301}'
|
||||
check_span (accents.find accent_1) (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
|
||||
|
||||
@ -1208,12 +1232,6 @@ spec =
|
||||
|
||||
"aaa aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (5.up_to 7) "aaa aaa")
|
||||
|
||||
Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
|
||||
check_span_all ("abacadae".find_all "a[bc]") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
|
||||
check_span_all ("abacadae".find_all "a.") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
|
||||
check_span_all ("abacadae".find_all "a.*") [Span.Value (0.up_to 8) "abacadae"]
|
||||
check_span_all ("abacadae".find_all "a.+?") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
|
||||
|
||||
Test.specify "should default to exact matching for locate but regex for match" <|
|
||||
txt = "aba[bc]adacae"
|
||||
"ab".locate "ab" . should_equal (Span.Value (0.up_to 2) "ab")
|
||||
@ -1230,7 +1248,7 @@ spec =
|
||||
check_span (txt.find "a[bc]") (Span.Value (0.up_to 2) "aba[bc]adacae")
|
||||
check_span_all (txt.find_all "a[bc]") [Span.Value (0.up_to 2) "aba[bc]adacae", Span.Value (9.up_to 11) "aba[bc]adacae"]
|
||||
|
||||
Test.group "Regex matching" <|
|
||||
Test.group "Regex: find and find_all" <|
|
||||
Test.specify "should be possible on text" <|
|
||||
match = "My Text: Goes Here".find "^My Text: (.+)$"
|
||||
check_span match (Span.Value (0.up_to 18) "My Text: Goes Here")
|
||||
@ -1245,6 +1263,80 @@ spec =
|
||||
match = "MY".find "my" Case_Sensitivity.Insensitive
|
||||
check_span match (Span.Value (0.up_to 2) "MY")
|
||||
|
||||
Test.specify "should allow regexes in match" <|
|
||||
hello = "Hello World!"
|
||||
|
||||
check_span (hello.find ".o" Case_Sensitivity.Insensitive) (Span.Value (3.up_to 5) "Hello World!")
|
||||
check_span_all (hello.find_all ".o") [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
|
||||
|
||||
check_span ("foobar".find "BAR" Case_Sensitivity.Insensitive) (Span.Value (3.up_to 6) "foobar")
|
||||
|
||||
## Regex matching does not do case folding
|
||||
"Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
|
||||
|
||||
Test.specify "should allow access to match groups by number" <|
|
||||
"abcddd".find "ab(c(d+))" . group 0 . text . should_equal "abcddd"
|
||||
"abcddd".find "ab(c(d+))" . group 1 . text . should_equal "cddd"
|
||||
"abcddd".find "ab(c(d+))" . group 2 . text . should_equal "ddd"
|
||||
|
||||
Test.specify "should allow access to match groups by name" <|
|
||||
"abcddd".find "ab(?<cee>c(d+))" . group "cee" . text . should_equal "cddd"
|
||||
|
||||
Test.specify "should throw No_Such_Group for an out-of-range group number" <|
|
||||
"abcddd".find "ab(c(d+))" . group 3 . text . should_fail_with No_Such_Group
|
||||
"abcddd".find "ab(c(d+))" . group 12 . text . should_fail_with No_Such_Group
|
||||
"abcddd".find "ab(c(d+))" . group (-1) . text . should_fail_with No_Such_Group
|
||||
|
||||
Test.specify "should throw No_Such_Group for an invalid group name" <|
|
||||
"abcddd".find "ab(?<cee>c(d+))" . group "dee" . text . should_fail_with No_Such_Group
|
||||
|
||||
Test.specify "should throw No_Such_Group for an invalid group name (when there are no named groups at all)" <|
|
||||
"abcddd".find "ab(c(d+))" . group "dee" . text . should_fail_with No_Such_Group
|
||||
|
||||
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
|
||||
"abcddd".find "ab(c(((((((" . group 0 . text . should_fail_with Regex_Syntax_Error
|
||||
|
||||
Test.specify ".group should return Nothing if the group did not participate in the match" <|
|
||||
match_c = "abc".find "ab((c)|(d))"
|
||||
match_c.group 1 . text . should_equal "c"
|
||||
match_c.group 2 . text . should_equal "c"
|
||||
match_c.group 3 . should_equal Nothing
|
||||
match_d = "abd".find "ab((c)|(d))"
|
||||
match_d.group 1 . text . should_equal "d"
|
||||
match_d.group 2 . should_equal Nothing
|
||||
match_d.group 3 . text . should_equal "d"
|
||||
|
||||
Test.specify "should handle accents and other multi-point graphemes" <|
|
||||
accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
|
||||
|
||||
# Check spans
|
||||
check_span ((accents).find 'h') (Span.Value (3.up_to 4) accents)
|
||||
check_span ((accents).find 'e\u{301}') (Span.Value (1.up_to 2) accents)
|
||||
check_span_all ((accents).find_all 'h') [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
|
||||
check_span_all ((accents).find_all 'e\u{301}') [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
|
||||
|
||||
# Check contents to make sure the spans' ranges are ok
|
||||
accents.find 'h' . group 0 . text . should_equal 'h'
|
||||
accents.find 'e\u{301}' . group 0 . text . should_equal 'e\u{301}'
|
||||
|
||||
Test.specify "should expand a partial-grapheme match to the whole grapheme" <|
|
||||
'e\u{301}'.find '\u{301}' . group 0 . text . should_equal 'e\u{301}'
|
||||
|
||||
Test.specify "should handle the Unicode normalization" pending="Use this to test exposed normalization methods" <|
|
||||
## This test passed for the builtin Java regex library, using
|
||||
Pattern.CANON_EQ, but since that option is buggy and rarely use,
|
||||
we won't attempt to recreate it with Truffle regex. Instead,
|
||||
expose normalization methods to allow developers to do it
|
||||
themselves.
|
||||
accents = 'a\u{301}e\u{301}o\u{301}'
|
||||
check_span (accents.find accent_1) (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
|
||||
|
||||
Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
|
||||
check_span_all ("abacadae".find_all "a[bc]") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
|
||||
check_span_all ("abacadae".find_all "a.") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
|
||||
check_span_all ("abacadae".find_all "a.*") [Span.Value (0.up_to 8) "abacadae"]
|
||||
check_span_all ("abacadae".find_all "a.+?") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
|
||||
|
||||
Test.group "Text.match" <|
|
||||
Test.specify "should default to regex" <|
|
||||
"My Text: Goes Here".match "^My Text: (.+)$" . should_be_true
|
||||
@ -1254,6 +1346,8 @@ spec =
|
||||
|
||||
Test.specify "should only match whole input" <|
|
||||
"Hello".match "[a-z]" . should_be_false
|
||||
"abcd".match "bcd" . should_be_false
|
||||
"abcd".match "abc" . should_be_false
|
||||
"x".match "[a-z]" . should_be_true
|
||||
|
||||
Test.specify "should be possible on unicode text" <|
|
||||
|
Loading…
Reference in New Issue
Block a user