Regex.Compile using Truffle Regex, update find, match and match_all (#5785)

This commit is contained in:
James Dunkerley 2023-03-10 21:49:50 +00:00 committed by GitHub
parent 91ef8acf35
commit 7887fb8d40
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 533 additions and 25 deletions

View File

@ -333,6 +333,8 @@
and renamed them to `match`, `find`, `find_all` (respectively).][5721]
- [Updated `rename_columns` to new API. Added `first_row`, `second_row` and
`last_row` to Table types][5719]
- [Remove many regex compile flags; separated `match` into `match` and
`match_all`.][5785]
- [Aligned names of columns created by column operations.][5850]
[debug-shortcuts]:
@ -509,10 +511,11 @@
[5699]: https://github.com/enso-org/enso/pull/5699
[5719]: https://github.com/enso-org/enso/pull/5719
[5721]: https://github.com/enso-org/enso/pull/5721
[5779]: https://github.com/enso-org/enso/pull/5779
[5757]: https://github.com/enso-org/enso/pull/5757
[5802]: https://github.com/enso-org/enso/pull/5802
[5774]: https://github.com/enso-org/enso/pull/5774
[5779]: https://github.com/enso-org/enso/pull/5779
[5785]: https://github.com/enso-org/enso/pull/5785
[5802]: https://github.com/enso-org/enso/pull/5802
[5850]: https://github.com/enso-org/enso/pull/5850
#### Enso Compiler

View File

@ -12,10 +12,11 @@ import project.Data.Text.Case_Sensitivity.Case_Sensitivity
import project.Data.Text.Encoding.Encoding
import project.Data.Text.Location
import project.Data.Text.Matching_Mode.Matching_Mode
import project.Data.Text.Regex
import project.Data.Text.Regex.Match.Match
import project.Data.Text.Regex.Regex_Mode.Regex_Mode
import project.Data.Text.Regex_Matcher.Regex_Matcher
import project.Data.Text.Regex_2
import project.Data.Text.Regex_2.Regex_Syntax_Error
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
@ -23,7 +24,6 @@ import project.Data.Text.Text_Matcher.Text_Matcher
import project.Data.Text.Text_Sub_Range.Codepoint_Ranges
import project.Data.Text.Text_Sub_Range.Text_Sub_Range
import project.Data.Vector.Vector
import project.Error.Common.Compile_Error
import project.Error.Common.Index_Out_Of_Bounds
import project.Error.Error
import project.Error.Encoding_Error.Encoding_Error
@ -227,10 +227,10 @@ Text.characters self =
example_find_insensitive =
## This matches `aBc` @ character 11
"aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Compile_Error
Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error
Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
Regex.compile pattern case_insensitive=case_insensitive . match self Matching_Mode.First
Regex_2.compile pattern case_insensitive=case_insensitive . match self
## Finds all the matches of the regular expression `pattern` in `self`,
returning a Vector. If not found, will be an empty Vector.
@ -249,12 +249,10 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
example_find_all_insensitive =
## This matches `aABbbbc` @ character 0 and `aBC` @ character 11
"aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Compile_Error
Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error
Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
case Regex.compile pattern case_insensitive=case_insensitive . match self Regex_Mode.All of
Nothing -> []
matches -> matches
Regex_2.compile pattern case_insensitive=case_insensitive . match_all self
## ALIAS Check Matches
@ -276,10 +274,10 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
regex = ".+ct@.+"
# Evaluates to true
"CONTACT@enso.org".match regex Case_Sensitivity.Insensitive
Text.match : Text -> Case_Sensitivity -> Boolean ! Compile_Error
Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error
Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
case_insensitive = case_sensitivity.is_case_insensitive_in_memory
compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
compiled_pattern.matches self
## ALIAS Split Text

View File

@ -1,5 +1,6 @@
## Internal text utilities for inspecting text primitives.
import project.Any.Any
import project.Data.Text.Text
## PRIVATE
@ -7,3 +8,17 @@ import project.Data.Text.Text
Forces flattening of a text value.
optimize : Text
optimize text = @Builtin_Method "Prim_Text_Helper.optimize"
## PRIVATE
Compile the regex using the Truffle regex library.
Returns a Java RegexObject (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
Arguments:
- pattern: the regex to compile
- options: string containing traditional regex flags (for example, "g"
as in "/foo/g"
compile_regex : Text -> Text -> Any
compile_regex pattern options = @Builtin_Method "Prim_Text_Helper.compile_regex"

View File

@ -0,0 +1,67 @@
from project.Data.Boolean import Boolean, True, False
import project.Data.Numbers.Integer
import project.Data.Range.Range
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Text
import project.Nothing.Nothing
type Match_2
## internal_regex_result : RegexResult (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
Value (pattern : Pattern_2) (internal_regex_result : Any) (input : Text)
## PRIVATE
Returns the start character of group n.
Arguments:
- n: the integer group number. Note that the groups explicitly
defined in the regex are numbered starting at 1; group 0 refers to the
entire match range.
start : Integer -> Integer
start self n = self.internal_regex_result.getStart n
## PRIVATE
Returns the start character of group n
Arguments:
- n: the integer group number
end : Integer -> Integer
end self n = self.internal_regex_result.getEnd n
## PRIVATE
Gets the text matched by the group with the provided identifier, or
`Nothing` if the group did not participate in the match. If no such group
exists for the provided identifier, a `No_Such_Group` is returned.
Arguments:
- id: The integer index or name of that group.
? The Full Match
The group with index 0 is always the full match of the pattern.
? Named Groups by Index
If the regex contained named groups, these may also be accessed by
index based on their position in the pattern.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
Match_2.group will return Nothing.
group : Integer | Text -> Span
group self id =
n = self.pattern.lookup_group id
start = self.start n
end = self.end n
does_not_participate = start == -1 || end == -1
case does_not_participate of
True -> Nothing
False ->
range = Range.new (self.start n) (self.end n)
(Utf_16_Span.Value range self.input).to_grapheme_span

View File

@ -0,0 +1,177 @@
import project.Any.Any
#import project.Data.Boolean.Boolean
from project.Data.Boolean import Boolean, True, False
import project.Data.Map.Map
import project.Data.Numbers.Integer
import project.Data.Range.Extensions
import project.Data.Range.Range
import project.Data.Text.Span.Span
import project.Data.Text.Span.Utf_16_Span
import project.Data.Text.Regex.Match_2.Match_2
import project.Data.Text.Regex_2.No_Such_Group
import project.Data.Text.Text
import project.Data.Vector.Vector
import project.Error.Error
import project.Meta
import project.Nothing.Nothing
import project.Panic.Panic
import project.Polyglot
polyglot java import org.enso.base.Text_Utils
type Pattern_2
## internal_regex_object : RegexObject (Truffle)
(See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
Value (internal_regex_object : Any)
## Returns `True` if the input matches against the pattern described by
`self`, otherwise `False`.
Arguments:
- input: The text to check for matching.
matches : Text -> Boolean
matches self input =
m = self.internal_regex_object.exec input 0
m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length
## Tries to match the provided `input` against the pattern `self`.
Arguments:
- input: The text to match the pattern described by `self` against.
match : Text -> Match_2 | Nothing
match self input =
it = Match_Iterator.new self input
case it.next of
Match_Iterator_Value.Next _ match _ -> match
Match_Iterator_Value.Last _ -> Nothing
## Tries to match the provided `input` against the pattern `self`.
Returns a Vector of Match_2 objects.
Arguments:
- input: The text to match the pattern described by `self` against.
match_all : Text -> Vector Match_2
match_all self input =
builder = Vector.new_builder
it = Match_Iterator.new self input
go it = case it.next of
Match_Iterator_Value.Next _ match next_it ->
builder.append match
go next_it
Match_Iterator_Value.Last _ -> Nothing
go it
builder.to_vector
## PRIVATE
Look up a match group name or number, and check that it is valid.
Arguments:
- id: The name or number of the group that was asked for.
Returns: a group number.
A group number is invalid if it is outside the range of groups
that were in the original pattern.
A group name is invalid if it was not defined in the original pattern.
A group name is an alias for a group number; if a name is passed to
this method, it returns the corresponding group number.
Note that it is possible for a group to "not participate in the match",
for example with a disjunction. In the example below, the "(d)" group
does not participate -- it neither matches nor fails.
"ab((c)|(d))".find "abc"
In this case, the group id for "(d)", which is 3, is a valid group id and
(Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
Match_2.group will return Nothing.
lookup_group : Integer | Text -> Integer ! No_Such_Group
lookup_group self id =
case id of
n : Integer -> case (n >= 0 && n < self.internal_regex_object.groupCount) of
True -> n
False -> Error.throw (No_Such_Group.Error n)
name : Text ->
# Maps name to number
groups = self.internal_regex_object.groups
n = case groups of
# If Nothing, there are no named groups
Nothing -> Error.throw (No_Such_Group.Error name)
_ ->
qq = (read_group_map groups name)
case qq of
Nothing -> Nothing
n : Integer -> n
case n of
_ : Integer -> n
Nothing -> Error.throw (No_Such_Group.Error name)
## PRIVATE
Performs the regex match, and iterates through the results. Yields both
the matched parts of the string, and the 'filler' parts between them.
At each step, it yields a Match_Iterator_Value, whivch has either a filler
and a match, or just the final filler. A Match_Iterator_Value.Last value is
return at the end, and only at the end.
type Match_Iterator
new : Pattern_2 -> Text -> Match_Iterator
new pattern input = Match_Iterator.Value pattern input 0
Value (pattern : Pattern_2) (input : Text) (cursor : Integer)
next : Match_Iterator_Value
next self =
regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
case regex_result.isMatch of
False ->
filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
Match_Iterator_Value.Last filler_span
True ->
match_start = regex_result.getStart 0
filler_range = Range.new self.cursor match_start
filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
match = Match_2.Value self.pattern regex_result self.input
next_cursor = match.end 0
next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
Match_Iterator_Value.Next filler_span match next_iterator
to_text_debug : Vector Text
to_text_debug self =
vb = Vector.new_builder
go it = case it.next of
Match_Iterator_Value.Next filler match next_it ->
vb.append ('\"' + filler.text + '\"')
vb.append ("/" + (match.span 0).text + "/")
go next_it
Match_Iterator_Value.Last filler ->
vb.append ('\"' + filler.text + '\"')
go self
vb.to_vector
## PRIVATE
type Match_Iterator_Value
Next (filler : Span) (match : Match_2) (next_iterator : Match_Iterator)
Last (filler : Span)
## PRIVATE
group_map_contains : Any -> Text -> Boolean
group_map_contains map elem =
members = Polyglot.get_members map
as_vec = Vector.from_polyglot_array members
as_vec.contains elem
## PRIVATE
read_group_map : Any -> Text -> Integer | Nothing
read_group_map map name =
case group_map_contains map name of
True -> Polyglot.get_member map name
False -> Nothing

View File

@ -0,0 +1,82 @@
import project.Any.Any
import project.Data.Numbers.Integer
import project.Data.Text.Prim_Text_Helper
import project.Data.Text.Regex.Pattern_2.Pattern_2
import project.Data.Text.Text
import project.Error.Error
import project.Error.Illegal_Argument.Illegal_Argument
import project.Nothing.Nothing
import project.Panic.Panic
from project.Data.Boolean import Boolean, True, False
from project.Error.Common import Syntax_Error
polyglot java import java.util.regex.Pattern as Java_Pattern
## Compile the provided `expression` into a regex pattern that can be used for
matching.
Arguments
- expression: The text representing the regular expression that you want to
compile.
- case_insensitive: Enables or disables case-insensitive matching. Case
insensitive matching behaves as if it normalises the case of all input
text before matching on it.
? Why Compile?
While many regex engines are able to cache ad-hoc patterns, it is often
useful to be able to manually retain a pattern that you have computed. This
function exists so you can hold onto the resultant `Pattern_2` object,
instead of immediately proceeding to match using it.
compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error
compile self expression case_insensitive=Nothing =
options_string = if case_insensitive == True then "usgi" else "usg"
internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
Pattern_2.Value internal_regex_object
## ADVANCED
Escape the special characters in `expression` such that the result is a
valid literal pattern for the original string.
Arguments:
- expression: The expression to escape metacharacters in.
> Example
Turn a Text into a regex that matches that string exactly.
import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
example_escape =
literal_string = "\!\.|abcde"
engine = Default_Engine.new
engine.escape literal_string
escape : Text -> Text
escape self expression = Java_Pattern.quote expression
## An error that is emitted when there is no such group in the match for the
provided `id`.
Arguments:
- id: The identifier of the group that was asked for but does not exist.
type No_Such_Group
Error (id : Text | Integer)
## PRIVATE
Provides a human-readable representation of the `No_Such_Group`.
to_display_text : Text
to_display_text self = case self.id of
_ : Integer -> "No group exists with the index " + self.id.to_text + "."
_ : Text -> "No group exists with the name " + self.id + "."
## A syntax error reported by the Truffle regex compiler.
type Regex_Syntax_Error
## PRIVATE
Arguments:
- message: A description of the erroneous syntax.
Error message

View File

@ -7,7 +7,7 @@
example_span =
text = "Hello!"
Span.Value 0 3 text
Span.Value (Range.new 0 3) text
import project.Data.Numbers.Integer
import project.Data.Pair.Pair

View File

@ -0,0 +1,70 @@
package org.enso.interpreter.node.expression.builtin.text;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
import com.oracle.truffle.api.dsl.Cached;
import com.oracle.truffle.api.dsl.Specialization;
import com.oracle.truffle.api.exception.AbstractTruffleException;
import com.oracle.truffle.api.nodes.Node;
import com.oracle.truffle.api.source.Source;
import org.enso.interpreter.dsl.BuiltinMethod;
import org.enso.interpreter.runtime.builtin.Builtins;
import org.enso.interpreter.runtime.callable.atom.Atom;
import org.enso.interpreter.runtime.data.text.Text;
import org.enso.interpreter.runtime.EnsoContext;
import org.enso.interpreter.runtime.error.PanicException;
@BuiltinMethod(
type = "Prim_Text_Helper",
name = "compile_regex",
description = "Compiles a regexp.",
autoRegister = false)
public abstract class RegexCompileNode extends Node {
static RegexCompileNode build() {
return RegexCompileNodeGen.create();
}
abstract Object execute(Object pattern, Object options);
@Specialization(
limit = "3",
guards = {
"pattern.toString().equals(cachedPattern)",
"options.toString().equals(cachedOptions)"
})
Object parseRegexPattern(
Text pattern,
Text options,
@Cached("pattern.toString()") String cachedPattern,
@Cached("options.toString()") String cachedOptions,
@Cached("compile(cachedPattern, cachedOptions)") Object regex) {
return regex;
}
@Specialization
Object alwaysCompile(Text pattern, Text options) {
return compile(pattern.toString(), options.toString());
}
@TruffleBoundary
Object compile(String pattern, String options) {
var ctx = EnsoContext.get(this);
var env = ctx.getEnvironment();
var s = "Flavor=ECMAScript/" + pattern + "/" + options;
var src =
Source.newBuilder("regex", s, "myRegex")
.mimeType("application/tregex")
.internal(true)
.build();
try {
var regex = env.parseInternal(src).call();
return regex;
} catch (AbstractTruffleException e) {
Builtins builtins = ctx.getBuiltins();
String msg = "Regex parse error: " + e.getMessage();
Atom err = builtins.error().makeSyntaxError(msg);
throw new PanicException(err, this);
}
}
}

View File

@ -166,6 +166,8 @@ public final class ParseStdLibTest extends TestCase {
"Data/Range.enso",
"Data/Sort_Column_Selector.enso",
"Data/Text/Extensions.enso",
"Data/Text/Regex/Match_2.enso",
"Data/Text/Regex/Pattern_2.enso",
"Data/Text/Regex/Regex_Mode.enso",
"Data/Value_Type.enso",
"Data/Vector.enso",

View File

@ -1,4 +1,6 @@
from Standard.Base import all
import Standard.Base.Data.Text.Regex_2.No_Such_Group
import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
import Standard.Base.Data.Text.Span.Span
import Standard.Base.Error.Common.Index_Out_Of_Bounds
import Standard.Base.Error.Common.Type_Error
@ -57,13 +59,17 @@ type Manual
- Note that currently the regex-based operations may not handle the edge
cases described above too well.
spec =
check_span result span = result.span 0 . to_grapheme_span . should_equal span
check_span_all result spans = result . map (m-> (m.span 0).to_grapheme_span) . should_equal spans
check_span match span =
match . should_not_equal Nothing
match.group 0 . should_equal span
check_span_all match spans = match . map (m-> (m.group 0)) . should_equal spans
accent_1 = '\u00E9'
accent_2 = '\u0065\u{301}'
Test.group "Text" <|
kshi = '\u0915\u094D\u0937\u093F'
facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
accent_1 = '\u00E9'
accent_2 = '\u0065\u{301}'
utf_8_whitespace = 'foo\n bar baz \u202F quux'
utf_8_whitespace_split = ["foo", "bar", "baz", "quux"]
sentences = '''
@ -1186,7 +1192,25 @@ spec =
## Regex matching does not do case folding
"Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
## But it should handle the Unicode normalization
Test.specify "should handle accents and other multi-point graphemes" <|
accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
# Check spans
check_span ((accents).find 'h') (Span.Value (3.up_to 4) accents)
check_span ((accents).find 'e\u{301}') (Span.Value (1.up_to 2) accents)
check_span_all ((accents).find_all 'h') [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
check_span_all ((accents).find_all 'e\u{301}') [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
# Check contents to make sure the spans' ranges are ok
accents.find 'h' . group 0 . text . should_equal 'h'
accents.find 'e\u{301}' . group 0 . text . should_equal 'e\u{301}'
Test.specify "should handle the Unicode normalization" pending="TODO: Rewrite this to use normalization methods explicitly" <|
## This test passed for the builtin Java regex library, using
Pattern.CANON_EQ, but since that option is buggy and rarely use,
we won't attempt to recreate it with Truffle regex. Instead,
expose normalization methods to allow developers to do it
themselves.
accents = 'a\u{301}e\u{301}o\u{301}'
check_span (accents.find accent_1) (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
@ -1208,12 +1232,6 @@ spec =
"aaa aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (5.up_to 7) "aaa aaa")
Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
check_span_all ("abacadae".find_all "a[bc]") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
check_span_all ("abacadae".find_all "a.") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
check_span_all ("abacadae".find_all "a.*") [Span.Value (0.up_to 8) "abacadae"]
check_span_all ("abacadae".find_all "a.+?") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
Test.specify "should default to exact matching for locate but regex for match" <|
txt = "aba[bc]adacae"
"ab".locate "ab" . should_equal (Span.Value (0.up_to 2) "ab")
@ -1230,7 +1248,7 @@ spec =
check_span (txt.find "a[bc]") (Span.Value (0.up_to 2) "aba[bc]adacae")
check_span_all (txt.find_all "a[bc]") [Span.Value (0.up_to 2) "aba[bc]adacae", Span.Value (9.up_to 11) "aba[bc]adacae"]
Test.group "Regex matching" <|
Test.group "Regex: find and find_all" <|
Test.specify "should be possible on text" <|
match = "My Text: Goes Here".find "^My Text: (.+)$"
check_span match (Span.Value (0.up_to 18) "My Text: Goes Here")
@ -1245,6 +1263,80 @@ spec =
match = "MY".find "my" Case_Sensitivity.Insensitive
check_span match (Span.Value (0.up_to 2) "MY")
Test.specify "should allow regexes in match" <|
hello = "Hello World!"
check_span (hello.find ".o" Case_Sensitivity.Insensitive) (Span.Value (3.up_to 5) "Hello World!")
check_span_all (hello.find_all ".o") [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
check_span ("foobar".find "BAR" Case_Sensitivity.Insensitive) (Span.Value (3.up_to 6) "foobar")
## Regex matching does not do case folding
"Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
Test.specify "should allow access to match groups by number" <|
"abcddd".find "ab(c(d+))" . group 0 . text . should_equal "abcddd"
"abcddd".find "ab(c(d+))" . group 1 . text . should_equal "cddd"
"abcddd".find "ab(c(d+))" . group 2 . text . should_equal "ddd"
Test.specify "should allow access to match groups by name" <|
"abcddd".find "ab(?<cee>c(d+))" . group "cee" . text . should_equal "cddd"
Test.specify "should throw No_Such_Group for an out-of-range group number" <|
"abcddd".find "ab(c(d+))" . group 3 . text . should_fail_with No_Such_Group
"abcddd".find "ab(c(d+))" . group 12 . text . should_fail_with No_Such_Group
"abcddd".find "ab(c(d+))" . group (-1) . text . should_fail_with No_Such_Group
Test.specify "should throw No_Such_Group for an invalid group name" <|
"abcddd".find "ab(?<cee>c(d+))" . group "dee" . text . should_fail_with No_Such_Group
Test.specify "should throw No_Such_Group for an invalid group name (when there are no named groups at all)" <|
"abcddd".find "ab(c(d+))" . group "dee" . text . should_fail_with No_Such_Group
Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
"abcddd".find "ab(c(((((((" . group 0 . text . should_fail_with Regex_Syntax_Error
Test.specify ".group should return Nothing if the group did not participate in the match" <|
match_c = "abc".find "ab((c)|(d))"
match_c.group 1 . text . should_equal "c"
match_c.group 2 . text . should_equal "c"
match_c.group 3 . should_equal Nothing
match_d = "abd".find "ab((c)|(d))"
match_d.group 1 . text . should_equal "d"
match_d.group 2 . should_equal Nothing
match_d.group 3 . text . should_equal "d"
Test.specify "should handle accents and other multi-point graphemes" <|
accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
# Check spans
check_span ((accents).find 'h') (Span.Value (3.up_to 4) accents)
check_span ((accents).find 'e\u{301}') (Span.Value (1.up_to 2) accents)
check_span_all ((accents).find_all 'h') [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
check_span_all ((accents).find_all 'e\u{301}') [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
# Check contents to make sure the spans' ranges are ok
accents.find 'h' . group 0 . text . should_equal 'h'
accents.find 'e\u{301}' . group 0 . text . should_equal 'e\u{301}'
Test.specify "should expand a partial-grapheme match to the whole grapheme" <|
'e\u{301}'.find '\u{301}' . group 0 . text . should_equal 'e\u{301}'
Test.specify "should handle the Unicode normalization" pending="Use this to test exposed normalization methods" <|
## This test passed for the builtin Java regex library, using
Pattern.CANON_EQ, but since that option is buggy and rarely use,
we won't attempt to recreate it with Truffle regex. Instead,
expose normalization methods to allow developers to do it
themselves.
accents = 'a\u{301}e\u{301}o\u{301}'
check_span (accents.find accent_1) (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
check_span_all ("abacadae".find_all "a[bc]") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
check_span_all ("abacadae".find_all "a.") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
check_span_all ("abacadae".find_all "a.*") [Span.Value (0.up_to 8) "abacadae"]
check_span_all ("abacadae".find_all "a.+?") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
Test.group "Text.match" <|
Test.specify "should default to regex" <|
"My Text: Goes Here".match "^My Text: (.+)$" . should_be_true
@ -1254,6 +1346,8 @@ spec =
Test.specify "should only match whole input" <|
"Hello".match "[a-z]" . should_be_false
"abcd".match "bcd" . should_be_false
"abcd".match "abc" . should_be_false
"x".match "[a-z]" . should_be_true
Test.specify "should be possible on unicode text" <|