From 3e7301ede7fd302b361727734d1906cca896dede Mon Sep 17 00:00:00 2001
From: Maxime Coste <mawww@kakoune.org>
Date: Wed, 6 Nov 2019 20:48:48 +1100
Subject: [PATCH] Support \x and \u escapes in regex character classes

Change \u to take 6 hexadecimal digits so it covers the full Unicode range.

Fixes #3172
---
 doc/pages/regex.asciidoc |  4 +-
 src/regex_impl.cc        | 97 ++++++++++++++++++++++++++--------------
 2 files changed, 66 insertions(+), 35 deletions(-)

diff --git a/doc/pages/regex.asciidoc b/doc/pages/regex.asciidoc
index a43fc5891..b03e580ff 100644
--- a/doc/pages/regex.asciidoc
+++ b/doc/pages/regex.asciidoc
@@ -23,7 +23,7 @@ Some literals are available as escape sequences:
 * `\0` matches the null character.
 * `\cX` matches the control-X character (X can be in `[A-Za-z]`).
 * `\xXX` matches the character whose codepoint is XX (in hexadecimal).
-* `\uXXXX` matches the character whose codepoint is XXXX (in hexadecimal).
+* `\uXXXXXX` matches the character whose codepoint is XXXXXX (in hexadecimal).
 
 == Character classes
 
@@ -185,3 +185,5 @@ exists for ease of use or performance reasons:
   escapes, identity escapes like `\X` with X a non-special character
   are not accepted, to avoid confusions between `\h` meaning literal
   `h` in ECMAScript, and horizontal blank in Kakoune.
+* `\uXXXXXX` uses 6 hexadecimal digits to cover all of Unicode, instead
+  of relying on ECMAScript UTF-16 surrogate pairs with 4 digits.
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 02db75a03..4ad80c299 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -349,6 +349,29 @@ private:
         }
     }
 
+    Codepoint read_hex(size_t count) // parses exactly `count` hex digits at m_pos and returns their value
+    {
+        Codepoint res = 0;
+        for (size_t i = 0; i < count; ++i) // index has the same type as count: no signed/unsigned comparison
+        {
+            if (at_end()) // pattern ended before all `count` digits were read
+                parse_error("unterminated hex sequence");
+            Codepoint digit = *m_pos++;
+            Codepoint digit_value = 0; // overwritten by every accepted branch below
+            if ('0' <= digit and digit <= '9')
+                digit_value = digit - '0';
+            else if ('a' <= digit and digit <= 'f')
+                digit_value = 0xa + digit - 'a';
+            else if ('A' <= digit and digit <= 'F')
+                digit_value = 0xa + digit - 'A';
+            else
+                parse_error(format("invalid hex digit '{}'", digit));
+
+            res = res * 16 + digit_value; // digits are read most-significant first
+        }
+        return res;
+    }
+
     NodeIndex atom_escape()
     {
         const Codepoint cp = *m_pos++;
@@ -381,29 +404,6 @@ private:
                 return new_node(ParsedRegex::Literal, control.value);
         }
 
-        auto read_hex = [this](size_t count)
-        {
-            Codepoint res = 0;
-            for (int i = 0; i < count; ++i)
-            {
-                if (at_end())
-                    parse_error("unterminated hex sequence");
-                Codepoint digit = *m_pos++;
-                Codepoint digit_value;
-                if ('0' <= digit and digit <= '9')
-                    digit_value = digit - '0';
-                else if ('a' <= digit and digit <= 'f')
-                    digit_value = 0xa + digit - 'a';
-                else if ('A' <= digit and digit <= 'F')
-                    digit_value = 0xa + digit - 'A';
-                else
-                    parse_error(format("invalid hex digit '{}'", digit));
-
-                res = res * 16 + digit_value;
-            }
-            return res;
-        };
-
         if (cp == '0')
             return new_node(ParsedRegex::Literal, '\0');
         else if (cp == 'c')
@@ -418,7 +418,7 @@ private:
         else if (cp == 'x')
             return new_node(ParsedRegex::Literal, read_hex(2));
         else if (cp == 'u')
-            return new_node(ParsedRegex::Literal, read_hex(4));
+            return new_node(ParsedRegex::Literal, read_hex(6));
 
         if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
             return new_node(ParsedRegex::Literal, cp);
@@ -470,6 +470,20 @@ private:
             if (at_end())
                 break;
 
+            auto read_escaped_char = [this]() { // decodes the escape following a backslash in a character class
+                Codepoint cp = *m_pos++; // NOTE(review): no at_end() check before this read - confirm one char remains
+                auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
+                if (it != std::end(control_escapes)) // listed in control_escapes: yield its mapped value
+                    return it->value;
+                if (cp == 'x') // \xXX: two hex digits
+                    return read_hex(2);
+                if (cp == 'u') // \uXXXXXX: six hex digits
+                    return read_hex(6);
+                if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
+                    parse_error(format("unknown character class escape '{}'", cp));
+                return cp; // an escaped syntax character stands for itself
+            };
+
             if (cp == '\\')
             {
                 auto it = find_if(character_class_escapes,
@@ -481,14 +495,7 @@ private:
                     continue;
                 }
                 else // its an escaped character
-                {
-                    cp = *m_pos++;
-                    auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
-                    if (it != std::end(control_escapes))
-                        cp = it->value;
-                    else if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
-                        parse_error(format("unknown character class escape '{}'", cp));
-                }
+                    cp = read_escaped_char();
             }
 
             CharacterClass::Range range = { cp, cp };
@@ -498,7 +505,10 @@ private:
                     break;
                 if (*m_pos != ']')
                 {
-                    range.max = *m_pos++;
+                    cp = *m_pos++;
+                    if (cp == '\\')
+                        cp = read_escaped_char();
+                    range.max = cp;
                     if (range.min > range.max)
                         parse_error("invalid range specified");
                 }
@@ -1522,13 +1532,32 @@ auto test_regex = UnitTest{[]{
         kak_assert(vm.exec("bCa"));
     }
 
+    {
+        TestVM<> vm{R"([\t-\r]+)"};
+        kak_assert(vm.exec("\t\n\v\f\r"));
+    }
+
+    {
+        TestVM<> vm{R"([^\x00-\x7F]+)"};
+        kak_assert(not vm.exec("ascii"));
+        kak_assert(vm.exec("←↑→↓"));
+        kak_assert(vm.exec("😄😊😉"));
+    }
+
+    {
+        TestVM<> vm{R"([^\u000000-\u00ffff]+)"};
+        kak_assert(not vm.exec("ascii"));
+        kak_assert(not vm.exec("←↑→↓"));
+        kak_assert(vm.exec("😄😊😉"));
+    }
+
     {
         TestVM<RegexMode::Forward | RegexMode::Search> vm{R"(д)"};
         kak_assert(vm.exec("д", RegexExecFlags::None));
     }
 
     {
-        TestVM<> vm{R"(\0\x0A\u260e\u260F)"};
+        TestVM<> vm{R"(\0\x0A\u00260e\u00260F)"};
         const char str[] = "\0\n☎☏"; // work around the null byte in the literal
         kak_assert(vm.exec({str, str + sizeof(str)-1}));
     }