diff --git a/compiler/parser/src/tokenizer/lexer.rs b/compiler/parser/src/tokenizer/lexer.rs index c9281e8610..45ced2d423 100644 --- a/compiler/parser/src/tokenizer/lexer.rs +++ b/compiler/parser/src/tokenizer/lexer.rs @@ -41,6 +41,14 @@ fn eat_identifier(input: &mut Peekable>) -> Option bool { + let i = c as u32; + return (0x202A <= i && i <= 0x202E) || (0x2066 <= i && i <= 0x2069); +} + impl Token { // Eats the parts of the unicode character after \u. fn eat_unicode_char(input: &mut Peekable>) -> Result<(usize, Char)> { @@ -198,6 +206,9 @@ impl Token { let mut len = 0; while let Some(c) = input.peek() { + if is_bidi_override(*c) { + return Err(ParserError::lexer_bidi_override().into()); + } if c == &'"' { break; } @@ -215,8 +226,14 @@ impl Token { Some('\'') => { input.next(); - let (len, character) = Self::eat_char(&mut input)?; + match input.peek() { + Some(c) if is_bidi_override(*c) => { + return Err(ParserError::lexer_bidi_override().into()); + } + _ => {} + } + let (len, character) = Self::eat_char(&mut input)?; if input.next_if_eq(&'\'').is_some() { input.next(); return Ok((len + 2, Token::CharLit(character))); @@ -294,6 +311,9 @@ impl Token { let mut comment = String::from("//"); while let Some(c) = input.next_if(|c| c != &'\n') { + if is_bidi_override(c) { + return Err(ParserError::lexer_bidi_override().into()); + } comment.push(c); } @@ -312,6 +332,9 @@ impl Token { let mut ended = false; while let Some(c) = input.next() { + if is_bidi_override(c) { + return Err(ParserError::lexer_bidi_override().into()); + } comment.push(c); if c == '*' && input.next_if_eq(&'/').is_some() { comment.push('/'); diff --git a/docs/grammar/README.md b/docs/grammar/README.md index 95eb46c8cc..5f9786f9be 100644 --- a/docs/grammar/README.md +++ b/docs/grammar/README.md @@ -21,11 +21,26 @@ along with the Leo library. If not, see . 
Lexical Grammar --------------- + +```abnf +ascii = %x0-7F +``` + + +```abnf +safe-nonascii = %x80-2029 / %x202F-2065 / %x206A-D7FF / %xE000-10FFFF + ; excludes bidi overrides and high/low surrogates +``` + ```abnf -character = %x0-D7FF / %xE000-10FFFF ; Unicode code points decoded from UTF-8 +character = ascii / safe-nonascii + ; Unicode code points decoded from UTF-8 ``` +Go to: _[ascii](#user-content-ascii), [safe-nonascii](#user-content-safe-nonascii)_; + + ```abnf horizontal-tab = %x9 ; <HT> @@ -58,33 +73,48 @@ single-quote = %x27 ; ' ```abnf -not-star = %x0-29 / %x2B-D7FF / %xE000-10FFFF ; anything but * +not-star = %x0-29 / %x2B-7F / safe-nonascii ; anything but * ``` +Go to: _[safe-nonascii](#user-content-safe-nonascii)_; + + ```abnf -not-star-or-slash = %x0-29 / %x2B-2E / %x30-D7FF / %xE000-10FFFF +not-star-or-slash = %x0-29 / %x2B-2E / %x30-7F / safe-nonascii ; anything but * or / ``` +Go to: _[safe-nonascii](#user-content-safe-nonascii)_; + + ```abnf -not-line-feed-or-carriage-return = %x0-9 / %xB-C / %xE-D7FF / %xE000-10FFFF +not-line-feed-or-carriage-return = %x0-9 / %xB-C / %xE-7F / safe-nonascii ; anything but <LF> or <CR> ``` +Go to: _[safe-nonascii](#user-content-safe-nonascii)_; + + ```abnf -not-double-quote-or-backslash = %x0-21 / %x23-5B / %x5D-D7FF / %xE000-10FFFF +not-double-quote-or-backslash = %x0-21 / %x23-5B / %x5D-7F / safe-nonascii ; anything but " or \ ``` +Go to: _[safe-nonascii](#user-content-safe-nonascii)_; + + ```abnf -not-single-quote-or-backslash = %x0-26 / %x28-5B / %x5D-D7FF / %xE000-10FFFF +not-single-quote-or-backslash = %x0-26 / %x28-5B / %x5D-7F / safe-nonascii ; anything but ' or \ ``` +Go to: _[safe-nonascii](#user-content-safe-nonascii)_; + + ```abnf line-terminator = line-feed / carriage-return / carriage-return line-feed diff --git a/docs/grammar/abnf-grammar.txt b/docs/grammar/abnf-grammar.txt index 40773688e8..d1b7a5c14b 100644 --- a/docs/grammar/abnf-grammar.txt +++ b/docs/grammar/abnf-grammar.txt @@ -19,7 +19,13 @@ ; Lexical 
Grammar ; --------------- -character = %x0-D7FF / %xE000-10FFFF ; Unicode code points decoded from UTF-8 +ascii = %x0-7F + +safe-nonascii = %x80-2029 / %x202F-2065 / %x206A-D7FF / %xE000-10FFFF + ; excludes bidi overrides and high/low surrogates + +character = ascii / safe-nonascii + ; Unicode code points decoded from UTF-8 horizontal-tab = %x9 ; <HT> @@ -33,18 +39,18 @@ double-quote = %x22 ; " single-quote = %x27 ; ' -not-star = %x0-29 / %x2B-D7FF / %xE000-10FFFF ; anything but * +not-star = %x0-29 / %x2B-7F / safe-nonascii ; anything but * -not-star-or-slash = %x0-29 / %x2B-2E / %x30-D7FF / %xE000-10FFFF +not-star-or-slash = %x0-29 / %x2B-2E / %x30-7F / safe-nonascii ; anything but * or / -not-line-feed-or-carriage-return = %x0-9 / %xB-C / %xE-D7FF / %xE000-10FFFF +not-line-feed-or-carriage-return = %x0-9 / %xB-C / %xE-7F / safe-nonascii ; anything but <LF> or <CR> -not-double-quote-or-backslash = %x0-21 / %x23-5B / %x5D-D7FF / %xE000-10FFFF +not-double-quote-or-backslash = %x0-21 / %x23-5B / %x5D-7F / safe-nonascii ; anything but " or \ -not-single-quote-or-backslash = %x0-26 / %x28-5B / %x5D-D7FF / %xE000-10FFFF +not-single-quote-or-backslash = %x0-26 / %x28-5B / %x5D-7F / safe-nonascii ; anything but ' or \ line-terminator = line-feed / carriage-return / carriage-return line-feed diff --git a/leo/errors/src/parser/parser_errors.rs b/leo/errors/src/parser/parser_errors.rs index 741f7ccd9b..15f7ebe3fd 100644 --- a/leo/errors/src/parser/parser_errors.rs +++ b/leo/errors/src/parser/parser_errors.rs @@ -374,4 +374,13 @@ create_errors!( msg: "`constant` is preferred over `const` for function parameters to indicate a R1CS constant.", help: None, } + + /// For when the lexer encountered a bidi override character + @backtraced + lexer_bidi_override { + args: (), + msg: "Unicode bidi override code point encountered.", + help: None, + } + ); diff --git a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out 
b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out index f429609540..eefa9cf0ea 100644 --- a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out @@ -48,3 +48,4 @@ outputs: - "Error [EPAR0370033]: The escaped unicode char `1234567890` is not within valid length of [1, 6]." - "Error [EPAR0370026]: Expected a closed char but found `򻮻`." - "Error [EPAR0370026]: Expected a closed char but found `😭`." + - "Error [EPAR0370043]: Unicode bidi override code point encountered." diff --git a/tests/expectations/parser/parser/expression/literal/string_fail.leo.out b/tests/expectations/parser/parser/expression/literal/string_fail.leo.out index 331ee1f2d9..0e4d924ff3 100644 --- a/tests/expectations/parser/parser/expression/literal/string_fail.leo.out +++ b/tests/expectations/parser/parser/expression/literal/string_fail.leo.out @@ -15,3 +15,4 @@ outputs: - "Error [EPAR0370032]: There was no closing `}` after a escaped unicode `af🦀\"`." - "Error [EPAR0370025]: Expected a closed string but found `\"`." - "Error [EPAR0370025]: Expected a closed string but found `⭇😍;`." + - "Error [EPAR0370043]: Unicode bidi override code point encountered." diff --git a/tests/expectations/parser/parser/program/bidi_comment.leo.out b/tests/expectations/parser/parser/program/bidi_comment.leo.out new file mode 100644 index 0000000000..61ceb26700 --- /dev/null +++ b/tests/expectations/parser/parser/program/bidi_comment.leo.out @@ -0,0 +1,5 @@ +--- +namespace: Parse +expectation: Fail +outputs: + - "Error [EPAR0370043]: Unicode bidi override code point encountered." 
diff --git a/tests/expectations/parser/parser/program/bidi_comment_2.leo.out b/tests/expectations/parser/parser/program/bidi_comment_2.leo.out new file mode 100644 index 0000000000..61ceb26700 --- /dev/null +++ b/tests/expectations/parser/parser/program/bidi_comment_2.leo.out @@ -0,0 +1,5 @@ +--- +namespace: Parse +expectation: Fail +outputs: + - "Error [EPAR0370043]: Unicode bidi override code point encountered." diff --git a/tests/expectations/parser/parser/statement/console_fail.leo.out b/tests/expectations/parser/parser/statement/console_fail.leo.out index b06712cbf1..759aa5bd62 100644 --- a/tests/expectations/parser/parser/statement/console_fail.leo.out +++ b/tests/expectations/parser/parser/statement/console_fail.leo.out @@ -2,5 +2,6 @@ namespace: ParseStatement expectation: Fail outputs: + - "Error [EPAR0370043]: Unicode bidi override code point encountered." - "Error [EPAR0370009]: unexpected string: expected 'formatted string', got '1'\n --> test:1:13\n |\n 1 | console.log(1);\n | ^" - "Error [EPAR0370007]: unexpected identifier: expected 'assert', 'error', 'log' -- got 'test'\n --> test:1:9\n |\n 1 | console.test();\n | ^^^^" diff --git a/tests/parser/expression/literal/char_fail.leo b/tests/parser/expression/literal/char_fail.leo index fbd343b4e5..166423f739 100644 --- a/tests/parser/expression/literal/char_fail.leo +++ b/tests/parser/expression/literal/char_fail.leo @@ -56,3 +56,4 @@ expectation: Fail '\u{bbbbb}\u{aaaa}' '😭😂😘' +'⁩' diff --git a/tests/parser/expression/literal/string_fail.leo b/tests/parser/expression/literal/string_fail.leo index dad1c2448c..4aa792b90a 100644 --- a/tests/parser/expression/literal/string_fail.leo +++ b/tests/parser/expression/literal/string_fail.leo @@ -27,4 +27,6 @@ expectation: Fail "\" -"⭇😍; \ No newline at end of file +"⭇😍; + +"2066:⁦" diff --git a/tests/parser/program/bidi_comment.leo b/tests/parser/program/bidi_comment.leo new file mode 100644 index 0000000000..cdeb79d904 --- /dev/null +++ 
b/tests/parser/program/bidi_comment.leo @@ -0,0 +1,9 @@ +/* +namespace: Parse +expectation: Fail +*/ + +function main() { + // 202E‮<-here + let x = 1u8; +} diff --git a/tests/parser/program/bidi_comment_2 b/tests/parser/program/bidi_comment_2 new file mode 100644 index 0000000000..49cd565a87 --- /dev/null +++ b/tests/parser/program/bidi_comment_2 @@ -0,0 +1,9 @@ +/* +namespace: Parse +expectation: Fail +*/ + +function main() { + /* next line starts with 2069 +⁩*/ +} diff --git a/tests/parser/program/bidi_comment_2.leo b/tests/parser/program/bidi_comment_2.leo new file mode 100644 index 0000000000..49cd565a87 --- /dev/null +++ b/tests/parser/program/bidi_comment_2.leo @@ -0,0 +1,9 @@ +/* +namespace: Parse +expectation: Fail +*/ + +function main() { + /* next line starts with 2069 +⁩*/ +} diff --git a/tests/parser/statement/console_fail.leo b/tests/parser/statement/console_fail.leo index bb9fff5836..6bf4371c67 100644 --- a/tests/parser/statement/console_fail.leo +++ b/tests/parser/statement/console_fail.leo @@ -3,6 +3,8 @@ namespace: ParseStatement expectation: Fail */ +console.error("‪"); // bidi override + console.log(1); -console.test(); \ No newline at end of file +console.test();