Regex.Compile using Truffle Regex, update find, match and match_all (#5785)

2024-11-22 22:10:15 +03:00 · 2023-03-10 21:49:50 +00:00 · 2023-03-10 21:49:50 +00:00 · 7887fb8d40
commit 7887fb8d40
parent 91ef8acf35
10 changed files with 533 additions and 25 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -333,6 +333,8 @@
  and renamed them to `match`, `find`, `find_all` (respectively).][5721]
 - [Updated `rename_columns` to new API. Added `first_row`, `second_row` and
  `last_row` to Table types][5719]
+- [Remove many regex compile flags; separated `match` into `match` and
+  `match_all`.][5785]
 - [Aligned names of columns created by column operations.][5850]

 [debug-shortcuts]:
@ -509,10 +511,11 @@
 [5699]: https://github.com/enso-org/enso/pull/5699
 [5719]: https://github.com/enso-org/enso/pull/5719
 [5721]: https://github.com/enso-org/enso/pull/5721
-[5779]: https://github.com/enso-org/enso/pull/5779
 [5757]: https://github.com/enso-org/enso/pull/5757
-[5802]: https://github.com/enso-org/enso/pull/5802
 [5774]: https://github.com/enso-org/enso/pull/5774
+[5779]: https://github.com/enso-org/enso/pull/5779
+[5785]: https://github.com/enso-org/enso/pull/5785
+[5802]: https://github.com/enso-org/enso/pull/5802
 [5850]: https://github.com/enso-org/enso/pull/5850

 #### Enso Compiler
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso
@ -12,10 +12,11 @@ import project.Data.Text.Case_Sensitivity.Case_Sensitivity
 import project.Data.Text.Encoding.Encoding
 import project.Data.Text.Location
 import project.Data.Text.Matching_Mode.Matching_Mode
-import project.Data.Text.Regex
 import project.Data.Text.Regex.Match.Match
 import project.Data.Text.Regex.Regex_Mode.Regex_Mode
 import project.Data.Text.Regex_Matcher.Regex_Matcher
+import project.Data.Text.Regex_2
+import project.Data.Text.Regex_2.Regex_Syntax_Error
 import project.Data.Text.Span.Span
 import project.Data.Text.Span.Utf_16_Span
 import project.Data.Text.Text
@ -23,7 +24,6 @@ import project.Data.Text.Text_Matcher.Text_Matcher
 import project.Data.Text.Text_Sub_Range.Codepoint_Ranges
 import project.Data.Text.Text_Sub_Range.Text_Sub_Range
 import project.Data.Vector.Vector
-import project.Error.Common.Compile_Error
 import project.Error.Common.Index_Out_Of_Bounds
 import project.Error.Error
 import project.Error.Encoding_Error.Encoding_Error
@ -227,10 +227,10 @@ Text.characters self =
         example_find_insensitive =
             ## This matches `aBc` @ character 11
             "aabbbbccccaaBcaaaa".find "a[ab]c" Case_Sensitivity.Insensitive
-Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Compile_Error
+Text.find : Text -> Case_Sensitivity -> Match | Nothing ! Regex_Syntax_Error
 Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    case_insensitive = case_sensitivity.is_case_insensitive_in_memory
-    Regex.compile pattern case_insensitive=case_insensitive . match self Matching_Mode.First
+    Regex_2.compile pattern case_insensitive=case_insensitive . match self

 ## Finds all the matches of the regular expression `pattern` in `self`,
   returning a Vector. If not found, will be an empty Vector.
@ -249,12 +249,10 @@ Text.find self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
         example_find_all_insensitive =
              ## This matches `aABbbbc` @ character 0 and `aBC` @ character 11
             "aABbbbccccaaBCaaaa".find_all "a[ab]+c" Case_Sensitivity.Insensitive
-Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Compile_Error
+Text.find_all : Text -> Case_Sensitivity -> Vector Match ! Regex_Syntax_Error
 Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    case_insensitive = case_sensitivity.is_case_insensitive_in_memory
-    case Regex.compile pattern case_insensitive=case_insensitive . match self Regex_Mode.All of
-        Nothing -> []
-        matches -> matches
+    Regex_2.compile pattern case_insensitive=case_insensitive . match_all self

 ## ALIAS Check Matches

@ -276,10 +274,10 @@ Text.find_all self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
             regex = ".+ct@.+"
             # Evaluates to true
             "CONTACT@enso.org".match regex Case_Sensitivity.Insensitive
-Text.match : Text -> Case_Sensitivity -> Boolean ! Compile_Error
+Text.match : Text -> Case_Sensitivity -> Boolean ! Regex_Syntax_Error
 Text.match self pattern=".*" case_sensitivity=Case_Sensitivity.Sensitive =
    case_insensitive = case_sensitivity.is_case_insensitive_in_memory
-    compiled_pattern = Regex.compile pattern case_insensitive=case_insensitive
+    compiled_pattern = Regex_2.compile pattern case_insensitive=case_insensitive
    compiled_pattern.matches self

 ## ALIAS Split Text
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Prim_Text_Helper.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Prim_Text_Helper.enso
@ -1,5 +1,6 @@
 ## Internal text utilities for inspecting text primitives.

+import project.Any.Any
 import project.Data.Text.Text

 ## PRIVATE
@ -7,3 +8,17 @@ import project.Data.Text.Text
   Forces flattening of a text value.
 optimize : Text
 optimize text = @Builtin_Method "Prim_Text_Helper.optimize"
+
+## PRIVATE
+
+   Compile the regex using the Truffle regex library.
+
+   Returns a Java RegexObject (Truffle)
+   (See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
+
+   Arguments:
+   - pattern: the regex to compile
+   - options: string containing traditional regex flags (for example, "g"
+    as in "/foo/g"
+compile_regex : Text -> Text -> Any
+compile_regex pattern options = @Builtin_Method "Prim_Text_Helper.compile_regex"
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Match_2.enso
@ -0,0 +1,67 @@
+from project.Data.Boolean import Boolean, True, False
+import project.Data.Numbers.Integer
+import project.Data.Range.Range
+import project.Data.Text.Span.Span
+import project.Data.Text.Span.Utf_16_Span
+import project.Data.Text.Text
+import project.Nothing.Nothing
+
+type Match_2
+    ## internal_regex_result : RegexResult (Truffle)
+       (See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
+    Value (pattern : Pattern_2) (internal_regex_result : Any) (input : Text)
+
+    ## PRIVATE
+       Returns the start character of group n.
+
+       Arguments:
+       - n: the integer group number. Note that the groups explicitly
+         defined in the regex are numbered starting at 1; group 0 refers to the
+         entire match range.
+    start : Integer -> Integer
+    start self n = self.internal_regex_result.getStart n
+
+    ## PRIVATE
+       Returns the start character of group n
+
+       Arguments:
+       - n: the integer group number
+    end : Integer -> Integer
+    end self n = self.internal_regex_result.getEnd n
+
+    ## PRIVATE
+
+       Gets the text matched by the group with the provided identifier, or
+       `Nothing` if the group did not participate in the match. If no such group
+       exists for the provided identifier, a `No_Such_Group` is returned.
+
+       Arguments:
+       - id: The integer index or name of that group.
+
+       ? The Full Match
+         The group with index 0 is always the full match of the pattern.
+
+       ? Named Groups by Index
+         If the regex contained named groups, these may also be accessed by
+         index based on their position in the pattern.
+
+       Note that it is possible for a group to "not participate in the match",
+       for example with a disjunction. In the example below, the "(d)" group
+       does not participate -- it neither matches nor fails.
+
+            "ab((c)|(d))".find "abc"
+
+       In this case, the group id for "(d)", which is 3, is a valid group id and
+       (Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
+       Match_2.group will return Nothing.
+    group : Integer | Text -> Span
+    group self id =
+        n = self.pattern.lookup_group id
+        start = self.start n
+        end = self.end n
+        does_not_participate = start == -1 || end == -1
+        case does_not_participate of
+            True -> Nothing
+            False ->
+                range = Range.new (self.start n) (self.end n)
+                (Utf_16_Span.Value range self.input).to_grapheme_span
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex/Pattern_2.enso
@ -0,0 +1,177 @@
+import project.Any.Any
+#import project.Data.Boolean.Boolean
+from project.Data.Boolean import Boolean, True, False
+import project.Data.Map.Map
+import project.Data.Numbers.Integer
+import project.Data.Range.Extensions
+import project.Data.Range.Range
+import project.Data.Text.Span.Span
+import project.Data.Text.Span.Utf_16_Span
+import project.Data.Text.Regex.Match_2.Match_2
+import project.Data.Text.Regex_2.No_Such_Group
+import project.Data.Text.Text
+import project.Data.Vector.Vector
+import project.Error.Error
+import project.Meta
+import project.Nothing.Nothing
+import project.Panic.Panic
+import project.Polyglot
+
+polyglot java import org.enso.base.Text_Utils
+
+type Pattern_2
+    ## internal_regex_object : RegexObject (Truffle)
+       (See https://github.com/oracle/graal/blob/master/regex/docs/README.md)
+    Value (internal_regex_object : Any)
+
+    ## Returns `True` if the input matches against the pattern described by
+       `self`, otherwise `False`.
+
+       Arguments:
+       - input: The text to check for matching.
+    matches : Text -> Boolean
+    matches self input =
+        m = self.internal_regex_object.exec input 0
+        m . isMatch && m.getStart 0 == 0 && m.getEnd 0 == input.length
+
+    ## Tries to match the provided `input` against the pattern `self`.
+
+       Arguments:
+       - input: The text to match the pattern described by `self` against.
+    match : Text -> Match_2 | Nothing
+    match self input =
+        it = Match_Iterator.new self input
+        case it.next of
+            Match_Iterator_Value.Next _ match _ -> match
+            Match_Iterator_Value.Last _ -> Nothing
+
+    ## Tries to match the provided `input` against the pattern `self`.
+
+       Returns a Vector of Match_2 objects.
+
+       Arguments:
+       - input: The text to match the pattern described by `self` against.
+    match_all : Text -> Vector Match_2
+    match_all self input =
+        builder = Vector.new_builder
+        it = Match_Iterator.new self input
+        go it = case it.next of
+            Match_Iterator_Value.Next _ match next_it ->
+                builder.append match
+                go next_it
+            Match_Iterator_Value.Last _ -> Nothing
+        go it
+        builder.to_vector
+
+    ## PRIVATE
+
+       Look up a match group name or number, and check that it is valid.
+
+       Arguments:
+       - id: The name or number of the group that was asked for.
+
+       Returns: a group number.
+
+       A group number is invalid if it is outside the range of groups
+       that were in the original pattern.
+
+       A group name is invalid if it was not defined in the original pattern.
+
+       A group name is an alias for a group number; if a name is passed to
+       this method, it returns the corresponding group number.
+
+       Note that it is possible for a group to "not participate in the match",
+       for example with a disjunction. In the example below, the "(d)" group
+       does not participate -- it neither matches nor fails.
+
+            "ab((c)|(d))".find "abc"
+
+       In this case, the group id for "(d)", which is 3, is a valid group id and
+       (Pattern_2.lookup_group 3) will return 3. If the caller tries to get group 3,
+       Match_2.group will return Nothing.
+
+    lookup_group : Integer | Text -> Integer ! No_Such_Group
+    lookup_group self id =
+        case id of
+            n : Integer -> case (n >= 0 && n < self.internal_regex_object.groupCount) of
+                True -> n
+                False -> Error.throw (No_Such_Group.Error n)
+            name : Text ->
+                # Maps name to number
+                groups = self.internal_regex_object.groups
+
+                n = case groups of
+                        # If Nothing, there are no named groups
+                        Nothing -> Error.throw (No_Such_Group.Error name)
+                        _ ->
+                            qq = (read_group_map groups name)
+                            case qq of
+                                Nothing -> Nothing
+                                n : Integer -> n
+                case n of
+                    _ : Integer -> n
+                    Nothing -> Error.throw (No_Such_Group.Error name)
+
+## PRIVATE
+
+    Performs the regex match, and iterates through the results. Yields both
+    the matched parts of the string, and the 'filler' parts between them.
+
+    At each step, it yields a Match_Iterator_Value, whivch has either a filler
+    and a match, or just the final filler. A Match_Iterator_Value.Last value is
+    return at the end, and only at the end.
+type Match_Iterator
+    new : Pattern_2 -> Text -> Match_Iterator
+    new pattern input = Match_Iterator.Value pattern input 0
+
+    Value (pattern : Pattern_2) (input : Text) (cursor : Integer)
+
+    next : Match_Iterator_Value
+    next self =
+        regex_result = self.pattern.internal_regex_object.exec self.input self.cursor
+        case regex_result.isMatch of
+            False ->
+                filler_range = Range.new self.cursor (Text_Utils.char_length self.input)
+                filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
+                Match_Iterator_Value.Last filler_span
+            True ->
+                match_start = regex_result.getStart 0
+                filler_range = Range.new self.cursor match_start
+                filler_span = (Utf_16_Span.Value filler_range self.input).to_grapheme_span
+                match = Match_2.Value self.pattern regex_result self.input
+                next_cursor = match.end 0
+                next_iterator = Match_Iterator.Value self.pattern self.input next_cursor
+                Match_Iterator_Value.Next filler_span match next_iterator
+
+    to_text_debug : Vector Text
+    to_text_debug self =
+        vb = Vector.new_builder
+        go it = case it.next of
+            Match_Iterator_Value.Next filler match next_it ->
+                vb.append ('\"' + filler.text + '\"')
+                vb.append ("/" + (match.span 0).text + "/")
+                go next_it
+            Match_Iterator_Value.Last filler ->
+                vb.append ('\"' + filler.text + '\"')
+        go self
+        vb.to_vector
+
+## PRIVATE
+type Match_Iterator_Value
+  Next (filler : Span) (match : Match_2) (next_iterator : Match_Iterator)
+  Last (filler : Span)
+
+## PRIVATE
+group_map_contains : Any -> Text -> Boolean
+group_map_contains map elem =
+    members = Polyglot.get_members map
+    as_vec = Vector.from_polyglot_array members
+    as_vec.contains elem
+
+## PRIVATE
+read_group_map : Any -> Text -> Integer | Nothing
+read_group_map map name =
+    case group_map_contains map name of
+        True -> Polyglot.get_member map name
+        False -> Nothing
+
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Regex_2.enso
@ -0,0 +1,82 @@
+import project.Any.Any
+import project.Data.Numbers.Integer
+import project.Data.Text.Prim_Text_Helper
+import project.Data.Text.Regex.Pattern_2.Pattern_2
+import project.Data.Text.Text
+import project.Error.Error
+import project.Error.Illegal_Argument.Illegal_Argument
+import project.Nothing.Nothing
+import project.Panic.Panic
+from project.Data.Boolean import Boolean, True, False
+from project.Error.Common import Syntax_Error
+
+polyglot java import java.util.regex.Pattern as Java_Pattern
+
+## Compile the provided `expression` into a regex pattern that can be used for
+   matching.
+
+   Arguments
+   - expression: The text representing the regular expression that you want to
+     compile.
+   - case_insensitive: Enables or disables case-insensitive matching. Case
+     insensitive matching behaves as if it normalises the case of all input
+     text before matching on it.
+
+   ? Why Compile?
+     While many regex engines are able to cache ad-hoc patterns, it is often
+     useful to be able to manually retain a pattern that you have computed. This
+     function exists so you can hold onto the resultant `Pattern_2` object,
+     instead of immediately proceeding to match using it.
+compile : Text -> Boolean | Nothing -> Pattern_2 ! Regex_Syntax_Error
+compile self expression case_insensitive=Nothing =
+    options_string = if case_insensitive == True then "usgi" else "usg"
+
+    internal_regex_object = Panic.catch Syntax_Error (Prim_Text_Helper.compile_regex expression options_string) caught_panic->
+        Error.throw (Regex_Syntax_Error.Error (caught_panic.payload.message))
+
+    Pattern_2.Value internal_regex_object
+
+## ADVANCED
+
+   Escape the special characters in `expression` such that the result is a
+   valid literal pattern for the original string.
+
+   Arguments:
+   - expression: The expression to escape metacharacters in.
+
+   > Example
+     Turn a Text into a regex that matches that string exactly.
+
+         import Standard.Base.Data.Text.Regex.Engine.Default as Default_Engine
+         import Standard.Base.Data.Text.Regex.Regex_Option.Regex_Option
+
+         example_escape =
+             literal_string = "\!\.|abcde"
+             engine = Default_Engine.new
+             engine.escape literal_string
+escape : Text -> Text
+escape self expression = Java_Pattern.quote expression
+
+## An error that is emitted when there is no such group in the match for the
+   provided `id`.
+
+   Arguments:
+   - id: The identifier of the group that was asked for but does not exist.
+type No_Such_Group
+    Error (id : Text | Integer)
+
+    ## PRIVATE
+
+       Provides a human-readable representation of the `No_Such_Group`.
+    to_display_text : Text
+    to_display_text self = case self.id of
+        _ : Integer -> "No group exists with the index " + self.id.to_text + "."
+        _ : Text -> "No group exists with the name " + self.id + "."
+
+## A syntax error reported by the Truffle regex compiler.
+type Regex_Syntax_Error
+    ## PRIVATE
+
+       Arguments:
+       - message: A description of the erroneous syntax.
+    Error message
--- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso
+++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Span.enso
@ -7,7 +7,7 @@

         example_span =
             text = "Hello!"
-             Span.Value 0 3 text
+             Span.Value (Range.new 0 3) text

 import project.Data.Numbers.Integer
 import project.Data.Pair.Pair
--- a/engine/runtime/src/main/java/org/enso/interpreter/node/expression/builtin/text/RegexCompileNode.java
+++ b/engine/runtime/src/main/java/org/enso/interpreter/node/expression/builtin/text/RegexCompileNode.java
@ -0,0 +1,70 @@
+package org.enso.interpreter.node.expression.builtin.text;
+
+import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
+import com.oracle.truffle.api.dsl.Cached;
+import com.oracle.truffle.api.dsl.Specialization;
+import com.oracle.truffle.api.exception.AbstractTruffleException;
+import com.oracle.truffle.api.nodes.Node;
+import com.oracle.truffle.api.source.Source;
+
+import org.enso.interpreter.dsl.BuiltinMethod;
+import org.enso.interpreter.runtime.builtin.Builtins;
+import org.enso.interpreter.runtime.callable.atom.Atom;
+import org.enso.interpreter.runtime.data.text.Text;
+import org.enso.interpreter.runtime.EnsoContext;
+import org.enso.interpreter.runtime.error.PanicException;
+
+@BuiltinMethod(
+    type = "Prim_Text_Helper",
+    name = "compile_regex",
+    description = "Compiles a regexp.",
+    autoRegister = false)
+public abstract class RegexCompileNode extends Node {
+  static RegexCompileNode build() {
+    return RegexCompileNodeGen.create();
+  }
+
+  abstract Object execute(Object pattern, Object options);
+
+  @Specialization(
+      limit = "3",
+      guards = {
+        "pattern.toString().equals(cachedPattern)",
+        "options.toString().equals(cachedOptions)"
+      })
+  Object parseRegexPattern(
+      Text pattern,
+      Text options,
+      @Cached("pattern.toString()") String cachedPattern,
+      @Cached("options.toString()") String cachedOptions,
+      @Cached("compile(cachedPattern, cachedOptions)") Object regex) {
+    return regex;
+  }
+
+  @Specialization
+  Object alwaysCompile(Text pattern, Text options) {
+    return compile(pattern.toString(), options.toString());
+  }
+
+  @TruffleBoundary
+  Object compile(String pattern, String options) {
+    var ctx = EnsoContext.get(this);
+    var env = ctx.getEnvironment();
+    var s = "Flavor=ECMAScript/" + pattern + "/" + options;
+    var src =
+        Source.newBuilder("regex", s, "myRegex")
+            .mimeType("application/tregex")
+            .internal(true)
+            .build();
+
+    try {
+      var regex = env.parseInternal(src).call();
+      return regex;
+    } catch (AbstractTruffleException e) {
+      Builtins builtins = ctx.getBuiltins();
+      String msg = "Regex parse error: " + e.getMessage();
+      Atom err = builtins.error().makeSyntaxError(msg);
+      throw new PanicException(err, this);
+    }
+  }
+}
--- a/engine/runtime/src/test/java/org/enso/compiler/ParseStdLibTest.java
+++ b/engine/runtime/src/test/java/org/enso/compiler/ParseStdLibTest.java
@ -166,6 +166,8 @@ public final class ParseStdLibTest extends TestCase {
            "Data/Range.enso",
            "Data/Sort_Column_Selector.enso",
            "Data/Text/Extensions.enso",
+            "Data/Text/Regex/Match_2.enso",
+            "Data/Text/Regex/Pattern_2.enso",
            "Data/Text/Regex/Regex_Mode.enso",
            "Data/Value_Type.enso",
            "Data/Vector.enso",
--- a/test/Tests/src/Data/Text_Spec.enso
+++ b/test/Tests/src/Data/Text_Spec.enso
@ -1,4 +1,6 @@
 from Standard.Base import all
+import Standard.Base.Data.Text.Regex_2.No_Such_Group
+import Standard.Base.Data.Text.Regex_2.Regex_Syntax_Error
 import Standard.Base.Data.Text.Span.Span
 import Standard.Base.Error.Common.Index_Out_Of_Bounds
 import Standard.Base.Error.Common.Type_Error
@ -57,13 +59,17 @@ type Manual
     - Note that currently the regex-based operations may not handle the edge
       cases described above too well.
 spec =
-    check_span result span = result.span 0 . to_grapheme_span . should_equal span
-    check_span_all result spans = result . map (m-> (m.span 0).to_grapheme_span) . should_equal spans
+    check_span match span =
+        match . should_not_equal Nothing
+        match.group 0 . should_equal span
+    check_span_all match spans = match . map (m-> (m.group 0)) . should_equal spans
+
+    accent_1 = '\u00E9'
+    accent_2 = '\u0065\u{301}'
+
    Test.group "Text" <|
        kshi = '\u0915\u094D\u0937\u093F'
        facepalm = '\u{1F926}\u{1F3FC}\u200D\u2642\uFE0F'
-        accent_1 = '\u00E9'
-        accent_2 = '\u0065\u{301}'
        utf_8_whitespace = 'foo\n bar     baz \u202F quux'
        utf_8_whitespace_split = ["foo", "bar", "baz", "quux"]
        sentences = '''
@ -1186,7 +1192,25 @@ spec =
            ## Regex matching does not do case folding
            "Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing

-            ## But it should handle the Unicode normalization
+        Test.specify "should handle accents and other multi-point graphemes" <|
+            accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
+
+            # Check spans
+            check_span ((accents).find 'h') (Span.Value (3.up_to 4) accents)
+            check_span ((accents).find 'e\u{301}') (Span.Value (1.up_to 2) accents)
+            check_span_all ((accents).find_all 'h') [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
+            check_span_all ((accents).find_all 'e\u{301}') [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
+
+            # Check contents to make sure the spans' ranges are ok
+            accents.find 'h' . group 0 . text . should_equal 'h'
+            accents.find 'e\u{301}' . group 0 . text . should_equal 'e\u{301}'
+
+        Test.specify "should handle the Unicode normalization" pending="TODO: Rewrite this to use normalization methods explicitly" <|
+            ## This test passed for the builtin Java regex library, using
+               Pattern.CANON_EQ, but since that option is buggy and rarely use,
+               we won't attempt to recreate it with Truffle regex. Instead,
+               expose normalization methods to allow developers to do it
+               themselves.
            accents = 'a\u{301}e\u{301}o\u{301}'
            check_span (accents.find accent_1) (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')

@ -1208,12 +1232,6 @@ spec =

            "aaa aaa".locate "aa" mode=Matching_Mode.Last case_sensitivity=Case_Sensitivity.Sensitive . should_equal (Span.Value (5.up_to 7) "aaa aaa")

-        Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
-            check_span_all ("abacadae".find_all "a[bc]") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
-            check_span_all ("abacadae".find_all "a.") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
-            check_span_all ("abacadae".find_all "a.*") [Span.Value (0.up_to 8) "abacadae"]
-            check_span_all ("abacadae".find_all "a.+?") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
-
        Test.specify "should default to exact matching for locate but regex for match" <|
            txt = "aba[bc]adacae"
            "ab".locate "ab" . should_equal (Span.Value (0.up_to 2) "ab")
@ -1230,7 +1248,7 @@ spec =
            check_span (txt.find "a[bc]") (Span.Value (0.up_to 2) "aba[bc]adacae")
            check_span_all (txt.find_all "a[bc]") [Span.Value (0.up_to 2) "aba[bc]adacae", Span.Value (9.up_to 11) "aba[bc]adacae"]

-    Test.group "Regex matching" <|
+    Test.group "Regex: find and find_all" <|
        Test.specify "should be possible on text" <|
            match = "My Text: Goes Here".find "^My Text: (.+)$"
            check_span match (Span.Value (0.up_to 18) "My Text: Goes Here")
@ -1245,6 +1263,80 @@ spec =
            match = "MY".find "my" Case_Sensitivity.Insensitive
            check_span match (Span.Value (0.up_to 2) "MY")

+        Test.specify "should allow regexes in match" <|
+            hello = "Hello World!"
+
+            check_span (hello.find ".o" Case_Sensitivity.Insensitive) (Span.Value (3.up_to 5) "Hello World!")
+            check_span_all (hello.find_all ".o") [Span.Value (3.up_to 5) "Hello World!", Span.Value (6.up_to 8) "Hello World!"]
+
+            check_span ("foobar".find "BAR" Case_Sensitivity.Insensitive) (Span.Value (3.up_to 6) "foobar")
+
+            ## Regex matching does not do case folding
+            "Strasse".find "ß" Case_Sensitivity.Insensitive . should_equal Nothing
+
+        Test.specify "should allow access to match groups by number" <|
+            "abcddd".find "ab(c(d+))" . group 0 . text . should_equal "abcddd"
+            "abcddd".find "ab(c(d+))" . group 1 . text . should_equal "cddd"
+            "abcddd".find "ab(c(d+))" . group 2 . text . should_equal "ddd"
+
+        Test.specify "should allow access to match groups by name" <|
+            "abcddd".find "ab(?<cee>c(d+))" . group "cee" . text . should_equal "cddd"
+
+        Test.specify "should throw No_Such_Group for an out-of-range group number" <|
+            "abcddd".find "ab(c(d+))" . group 3 . text . should_fail_with No_Such_Group
+            "abcddd".find "ab(c(d+))" . group 12 . text . should_fail_with No_Such_Group
+            "abcddd".find "ab(c(d+))" . group (-1) . text . should_fail_with No_Such_Group
+
+        Test.specify "should throw No_Such_Group for an invalid group name" <|
+            "abcddd".find "ab(?<cee>c(d+))" . group "dee" . text . should_fail_with No_Such_Group
+
+        Test.specify "should throw No_Such_Group for an invalid group name (when there are no named groups at all)" <|
+            "abcddd".find "ab(c(d+))" . group "dee" . text . should_fail_with No_Such_Group
+
+        Test.specify "should throw Regex_Syntax_Error for a regex with incorrect syntax" <|
+            "abcddd".find "ab(c(((((((" . group 0 . text . should_fail_with Regex_Syntax_Error
+
+        Test.specify ".group should return Nothing if the group did not participate in the match" <|
+            match_c = "abc".find "ab((c)|(d))"
+            match_c.group 1 . text . should_equal "c"
+            match_c.group 2 . text . should_equal "c"
+            match_c.group 3 . should_equal Nothing
+            match_d = "abd".find "ab((c)|(d))"
+            match_d.group 1 . text . should_equal "d"
+            match_d.group 2 . should_equal Nothing
+            match_d.group 3 . text . should_equal "d"
+
+        Test.specify "should handle accents and other multi-point graphemes" <|
+            accents = 'a\u{301}e\u{301}o\u{301}he\u{301}h'
+
+            # Check spans
+            check_span ((accents).find 'h') (Span.Value (3.up_to 4) accents)
+            check_span ((accents).find 'e\u{301}') (Span.Value (1.up_to 2) accents)
+            check_span_all ((accents).find_all 'h') [Span.Value (3.up_to 4) accents, Span.Value (5.up_to 6) accents]
+            check_span_all ((accents).find_all 'e\u{301}') [Span.Value (1.up_to 2) accents, Span.Value (4.up_to 5) accents]
+
+            # Check contents to make sure the spans' ranges are ok
+            accents.find 'h' . group 0 . text . should_equal 'h'
+            accents.find 'e\u{301}' . group 0 . text . should_equal 'e\u{301}'
+
+        Test.specify "should expand a partial-grapheme match to the whole grapheme" <|
+            'e\u{301}'.find '\u{301}' . group 0 . text . should_equal 'e\u{301}'
+
+        Test.specify "should handle the Unicode normalization" pending="Use this to test exposed normalization methods" <|
+            ## This test passed for the builtin Java regex library, using
+               Pattern.CANON_EQ, but since that option is buggy and rarely use,
+               we won't attempt to recreate it with Truffle regex. Instead,
+               expose normalization methods to allow developers to do it
+               themselves.
+            accents = 'a\u{301}e\u{301}o\u{301}'
+            check_span (accents.find accent_1) (Span.Value (1.up_to 2) 'a\u{301}e\u{301}o\u{301}')
+
+        Test.specify "should allow to match one or more occurrences of a pattern in the text" <|
+            check_span_all ("abacadae".find_all "a[bc]") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae"]
+            check_span_all ("abacadae".find_all "a.") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
+            check_span_all ("abacadae".find_all "a.*") [Span.Value (0.up_to 8) "abacadae"]
+            check_span_all ("abacadae".find_all "a.+?") [Span.Value (0.up_to 2) "abacadae", Span.Value (2.up_to 4) "abacadae", Span.Value (4.up_to 6) "abacadae", Span.Value (6.up_to 8) "abacadae"]
+
    Test.group "Text.match" <|
        Test.specify "should default to regex" <|
            "My Text: Goes Here".match "^My Text: (.+)$" . should_be_true
@ -1254,6 +1346,8 @@ spec =

        Test.specify "should only match whole input" <|
            "Hello".match "[a-z]" . should_be_false
+            "abcd".match "bcd" . should_be_false
+            "abcd".match "abc" . should_be_false
            "x".match "[a-z]" . should_be_true

        Test.specify "should be possible on unicode text" <|