From 3e7301ede7fd302b361727734d1906cca896dede Mon Sep 17 00:00:00 2001 From: Maxime Coste Date: Wed, 6 Nov 2019 20:48:48 +1100 Subject: [PATCH] Support \x and \u escapes in regex character classes Change \u to use 6 digits to cover the full unicode range. Fixes #3172 --- doc/pages/regex.asciidoc | 4 +- src/regex_impl.cc | 97 ++++++++++++++++++++++++++-------------- 2 files changed, 66 insertions(+), 35 deletions(-) diff --git a/doc/pages/regex.asciidoc b/doc/pages/regex.asciidoc index a43fc5891..b03e580ff 100644 --- a/doc/pages/regex.asciidoc +++ b/doc/pages/regex.asciidoc @@ -23,7 +23,7 @@ Some literals are available as escape sequences: * `\0` matches the null character. * `\cX` matches the control-X character (X can be in `[A-Za-z]`). * `\xXX` matches the character whose codepoint is XX (in hexadecimal). -* `\uXXXX` matches the character whose codepoint is XXXX (in hexadecimal). +* `\uXXXXXX` matches the character whose codepoint is XXXXXX (in hexadecimal). == Character classes @@ -185,3 +185,5 @@ exists for ease of use or performance reasons: escapes, identity escapes like `\X` with X a non-special character are not accepted, to avoid confusions between `\h` meaning literal `h` in ECMAScript, and horizontal blank in Kakoune. +* `\uXXXXXX` uses 6 digits to cover all of unicode, instead of relying + on ECMAScript UTF-16 surrogate pairs with 4 digits. diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 02db75a03..4ad80c299 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -349,6 +349,29 @@ private: } } + Codepoint read_hex(size_t count) + { + Codepoint res = 0; + for (int i = 0; i < count; ++i) + { + if (at_end()) + parse_error("unterminated hex sequence"); + Codepoint digit = *m_pos++; + Codepoint digit_value; + if ('0' <= digit and digit <= '9') + digit_value = digit - '0'; + else if ('a' <= digit and digit <= 'f') + digit_value = 0xa + digit - 'a'; + else if ('A' <= digit and digit <= 'F') + digit_value = 0xa + digit - 'A'; + else + parse_error(format("invalid hex digit '{}'", digit)); + + res = res * 16 + digit_value; + } + return res; + } + NodeIndex atom_escape() { const Codepoint cp = *m_pos++; @@ -381,29 +404,6 @@ private: return new_node(ParsedRegex::Literal, control.value); } - auto read_hex = [this](size_t count) - { - Codepoint res = 0; - for (int i = 0; i < count; ++i) - { - if (at_end()) - parse_error("unterminated hex sequence"); - Codepoint digit = *m_pos++; - Codepoint digit_value; - if ('0' <= digit and digit <= '9') - digit_value = digit - '0'; - else if ('a' <= digit and digit <= 'f') - digit_value = 0xa + digit - 'a'; - else if ('A' <= digit and digit <= 'F') - digit_value = 0xa + digit - 'A'; - else - parse_error(format("invalid hex digit '{}'", digit)); - - res = res * 16 + digit_value; - } - return res; - }; - if (cp == '0') return new_node(ParsedRegex::Literal, '\0'); else if (cp == 'c') @@ -418,7 +418,7 @@ private: else if (cp == 'x') return new_node(ParsedRegex::Literal, read_hex(2)); else if (cp == 'u') - return new_node(ParsedRegex::Literal, read_hex(4)); + return new_node(ParsedRegex::Literal, read_hex(6)); if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter return new_node(ParsedRegex::Literal, cp); @@ -470,6 +470,20 @@ private: if (at_end()) break; + auto read_escaped_char = [this]() { + Codepoint cp = *m_pos++; + auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; }); + if (it != std::end(control_escapes)) + return it->value; + if (cp == 'x') + return read_hex(2); + if (cp == 'u') + return read_hex(6); + if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and - + parse_error(format("unknown character class escape '{}'", cp)); + return cp; + }; + if (cp == '\\') { auto it = find_if(character_class_escapes, @@ -481,14 +495,7 @@ private: continue; } else // its an escaped character - { - cp = *m_pos++; - auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; }); - if (it != std::end(control_escapes)) - cp = it->value; - else if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and - - parse_error(format("unknown character class escape '{}'", cp)); - } + cp = read_escaped_char(); } CharacterClass::Range range = { cp, cp }; @@ -498,7 +505,10 @@ private: break; if (*m_pos != ']') { - range.max = *m_pos++; + cp = *m_pos++; + if (cp == '\\') + cp = read_escaped_char(); + range.max = cp; if (range.min > range.max) parse_error("invalid range specified"); } @@ -1522,13 +1532,32 @@ auto test_regex = UnitTest{[]{ kak_assert(vm.exec("bCa")); } + { + TestVM<> vm{R"([\t-\r]+)"}; + kak_assert(vm.exec("\t\n\v\f\r")); + } + + { + TestVM<> vm{R"([^\x00-\x7F]+)"}; + kak_assert(not vm.exec("ascii")); + kak_assert(vm.exec("โ†โ†‘โ†’โ†“")); + kak_assert(vm.exec("๐Ÿ˜„๐Ÿ˜Š๐Ÿ˜‰")); + } + + { + TestVM<> vm{R"([^\u000000-\u00ffff]+)"}; + kak_assert(not vm.exec("ascii")); + kak_assert(not vm.exec("โ†โ†‘โ†’โ†“")); + kak_assert(vm.exec("๐Ÿ˜„๐Ÿ˜Š๐Ÿ˜‰")); + } + { TestVM vm{R"(ะด)"}; kak_assert(vm.exec("ะด", RegexExecFlags::None)); } { - TestVM<> vm{R"(\0\x0A\u260e\u260F)"}; + TestVM<> vm{R"(\0\x0A\u00260e\u00260F)"}; const char str[] = "\0\nโ˜Žโ˜"; // work around the null byte in the literal kak_assert(vm.exec({str, str + sizeof(str)-1})); }