Merge pull request #1751 from AleoHQ/abnf-bidi-overrides

[ABNF] disallow bidi override codepoints
This commit is contained in:
Collin Chin 2022-04-20 09:34:00 -07:00 committed by GitHub
commit 23fca6af47
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 128 additions and 15 deletions

View File

@ -41,6 +41,14 @@ fn eat_identifier(input: &mut Peekable<impl Iterator<Item = char>>) -> Option<St
Some(ident) Some(ident)
} }
///
/// Checks if a char is a Unicode Bidirectional Override code point.
///
/// Returns `true` for the explicit embedding/override controls
/// U+202A..=U+202E (LRE, RLE, PDF, LRO, RLO) and for the isolate controls
/// U+2066..=U+2069 (LRI, RLI, FSI, PDI); returns `false` for every other char.
///
fn is_bidi_override(c: char) -> bool {
    // Match on `char` ranges directly: no numeric cast, both bounds inclusive.
    matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}')
}
impl Token { impl Token {
// Eats the parts of the unicode character after \u. // Eats the parts of the unicode character after \u.
fn eat_unicode_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> { fn eat_unicode_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
@ -198,6 +206,9 @@ impl Token {
let mut len = 0; let mut len = 0;
while let Some(c) = input.peek() { while let Some(c) = input.peek() {
if is_bidi_override(*c) {
return Err(ParserError::lexer_bidi_override().into());
}
if c == &'"' { if c == &'"' {
break; break;
} }
@ -215,8 +226,14 @@ impl Token {
Some('\'') => { Some('\'') => {
input.next(); input.next();
let (len, character) = Self::eat_char(&mut input)?; match input.peek() {
Some(c) if is_bidi_override(*c) => {
return Err(ParserError::lexer_bidi_override().into());
}
_ => {}
}
let (len, character) = Self::eat_char(&mut input)?;
if input.next_if_eq(&'\'').is_some() { if input.next_if_eq(&'\'').is_some() {
input.next(); input.next();
return Ok((len + 2, Token::CharLit(character))); return Ok((len + 2, Token::CharLit(character)));
@ -294,6 +311,9 @@ impl Token {
let mut comment = String::from("//"); let mut comment = String::from("//");
while let Some(c) = input.next_if(|c| c != &'\n') { while let Some(c) = input.next_if(|c| c != &'\n') {
if is_bidi_override(c) {
return Err(ParserError::lexer_bidi_override().into());
}
comment.push(c); comment.push(c);
} }
@ -312,6 +332,9 @@ impl Token {
let mut ended = false; let mut ended = false;
while let Some(c) = input.next() { while let Some(c) = input.next() {
if is_bidi_override(c) {
return Err(ParserError::lexer_bidi_override().into());
}
comment.push(c); comment.push(c);
if c == '*' && input.next_if_eq(&'/').is_some() { if c == '*' && input.next_if_eq(&'/').is_some() {
comment.push('/'); comment.push('/');

View File

@ -21,11 +21,26 @@ along with the Leo library. If not, see <https://www.gnu.org/licenses/>.
Lexical Grammar Lexical Grammar
--------------- ---------------
<a name="ascii"></a>
```abnf
ascii = %x0-7F
```
<a name="safe-nonascii"></a>
```abnf
safe-nonascii = %x80-2029 / %x202F-2065 / %x206A-D7FF / %xE000-10FFFF
; excludes bidi overrides and high/low surrogates
```
<a name="character"></a> <a name="character"></a>
```abnf ```abnf
character = %x0-D7FF / %xE000-10FFFF ; Unicode code points decoded from UTF-8 character = ascii / safe-nonascii
; Unicode code points decoded from UTF-8
``` ```
Go to: _[ascii](#user-content-ascii), [safe-nonascii](#user-content-safe-nonascii)_;
<a name="horizontal-tab"></a> <a name="horizontal-tab"></a>
```abnf ```abnf
horizontal-tab = %x9 ; <HT> horizontal-tab = %x9 ; <HT>
@ -58,33 +73,48 @@ single-quote = %x27 ; '
<a name="not-star"></a> <a name="not-star"></a>
```abnf ```abnf
not-star = %x0-29 / %x2B-D7FF / %xE000-10FFFF ; anything but * not-star = %x0-29 / %x2B-7F / safe-nonascii ; anything but *
``` ```
Go to: _[safe-nonascii](#user-content-safe-nonascii)_;
<a name="not-star-or-slash"></a> <a name="not-star-or-slash"></a>
```abnf ```abnf
not-star-or-slash = %x0-29 / %x2B-2E / %x30-D7FF / %xE000-10FFFF not-star-or-slash = %x0-29 / %x2B-2E / %x30-7F / safe-nonascii
; anything but * or / ; anything but * or /
``` ```
Go to: _[safe-nonascii](#user-content-safe-nonascii)_;
<a name="not-line-feed-or-carriage-return"></a> <a name="not-line-feed-or-carriage-return"></a>
```abnf ```abnf
not-line-feed-or-carriage-return = %x0-9 / %xB-C / %xE-D7FF / %xE000-10FFFF not-line-feed-or-carriage-return = %x0-9 / %xB-C / %xE-7F / safe-nonascii
; anything but <LF> or <CR> ; anything but <LF> or <CR>
``` ```
Go to: _[safe-nonascii](#user-content-safe-nonascii)_;
<a name="not-double-quote-or-backslash"></a> <a name="not-double-quote-or-backslash"></a>
```abnf ```abnf
not-double-quote-or-backslash = %x0-21 / %x23-5B / %x5D-D7FF / %xE000-10FFFF not-double-quote-or-backslash = %x0-21 / %x23-5B / %x5D-7F / safe-nonascii
; anything but " or \ ; anything but " or \
``` ```
Go to: _[safe-nonascii](#user-content-safe-nonascii)_;
<a name="not-single-quote-or-backslash"></a> <a name="not-single-quote-or-backslash"></a>
```abnf ```abnf
not-single-quote-or-backslash = %x0-26 / %x28-5B / %x5D-D7FF / %xE000-10FFFF not-single-quote-or-backslash = %x0-26 / %x28-5B / %x5D-7F / safe-nonascii
; anything but ' or \ ; anything but ' or \
``` ```
Go to: _[safe-nonascii](#user-content-safe-nonascii)_;
<a name="line-terminator"></a> <a name="line-terminator"></a>
```abnf ```abnf
line-terminator = line-feed / carriage-return / carriage-return line-feed line-terminator = line-feed / carriage-return / carriage-return line-feed

View File

@ -19,7 +19,13 @@
; Lexical Grammar ; Lexical Grammar
; --------------- ; ---------------
character = %x0-D7FF / %xE000-10FFFF ; Unicode code points decoded from UTF-8 ascii = %x0-7F
safe-nonascii = %x80-2029 / %x202F-2065 / %x206A-D7FF / %xE000-10FFFF
; excludes bidi overrides and high/low surrogates
character = ascii / safe-nonascii
; Unicode code points decoded from UTF-8
horizontal-tab = %x9 ; <HT> horizontal-tab = %x9 ; <HT>
@ -33,18 +39,18 @@ double-quote = %x22 ; "
single-quote = %x27 ; ' single-quote = %x27 ; '
not-star = %x0-29 / %x2B-D7FF / %xE000-10FFFF ; anything but * not-star = %x0-29 / %x2B-7F / safe-nonascii ; anything but *
not-star-or-slash = %x0-29 / %x2B-2E / %x30-D7FF / %xE000-10FFFF not-star-or-slash = %x0-29 / %x2B-2E / %x30-7F / safe-nonascii
; anything but * or / ; anything but * or /
not-line-feed-or-carriage-return = %x0-9 / %xB-C / %xE-D7FF / %xE000-10FFFF not-line-feed-or-carriage-return = %x0-9 / %xB-C / %xE-7F / safe-nonascii
; anything but <LF> or <CR> ; anything but <LF> or <CR>
not-double-quote-or-backslash = %x0-21 / %x23-5B / %x5D-D7FF / %xE000-10FFFF not-double-quote-or-backslash = %x0-21 / %x23-5B / %x5D-7F / safe-nonascii
; anything but " or \ ; anything but " or \
not-single-quote-or-backslash = %x0-26 / %x28-5B / %x5D-D7FF / %xE000-10FFFF not-single-quote-or-backslash = %x0-26 / %x28-5B / %x5D-7F / safe-nonascii
; anything but ' or \ ; anything but ' or \
line-terminator = line-feed / carriage-return / carriage-return line-feed line-terminator = line-feed / carriage-return / carriage-return line-feed

View File

@ -374,4 +374,13 @@ create_errors!(
msg: "`constant` is preferred over `const` for function parameters to indicate a R1CS constant.", msg: "`constant` is preferred over `const` for function parameters to indicate a R1CS constant.",
help: None, help: None,
} }
/// For when the lexer encountered a bidi override character
@backtraced
lexer_bidi_override {
args: (),
msg: "Unicode bidi override code point encountered.",
help: None,
}
); );

View File

@ -48,3 +48,4 @@ outputs:
- "Error [EPAR0370033]: The escaped unicode char `1234567890` is not within valid length of [1, 6]." - "Error [EPAR0370033]: The escaped unicode char `1234567890` is not within valid length of [1, 6]."
- "Error [EPAR0370026]: Expected a closed char but found `򻮻`." - "Error [EPAR0370026]: Expected a closed char but found `򻮻`."
- "Error [EPAR0370026]: Expected a closed char but found `😭`." - "Error [EPAR0370026]: Expected a closed char but found `😭`."
- "Error [EPAR0370043]: Unicode bidi override code point encountered."

View File

@ -15,3 +15,4 @@ outputs:
- "Error [EPAR0370032]: There was no closing `}` after a escaped unicode `af🦀\"`." - "Error [EPAR0370032]: There was no closing `}` after a escaped unicode `af🦀\"`."
- "Error [EPAR0370025]: Expected a closed string but found `\"`." - "Error [EPAR0370025]: Expected a closed string but found `\"`."
- "Error [EPAR0370025]: Expected a closed string but found `⭇😍;`." - "Error [EPAR0370025]: Expected a closed string but found `⭇😍;`."
- "Error [EPAR0370043]: Unicode bidi override code point encountered."

View File

@ -0,0 +1,5 @@
---
namespace: Parse
expectation: Fail
outputs:
- "Error [EPAR0370043]: Unicode bidi override code point encountered."

View File

@ -0,0 +1,5 @@
---
namespace: Parse
expectation: Fail
outputs:
- "Error [EPAR0370043]: Unicode bidi override code point encountered."

View File

@ -2,5 +2,6 @@
namespace: ParseStatement namespace: ParseStatement
expectation: Fail expectation: Fail
outputs: outputs:
- "Error [EPAR0370043]: Unicode bidi override code point encountered."
- "Error [EPAR0370009]: unexpected string: expected 'formatted string', got '1'\n --> test:1:13\n |\n 1 | console.log(1);\n | ^" - "Error [EPAR0370009]: unexpected string: expected 'formatted string', got '1'\n --> test:1:13\n |\n 1 | console.log(1);\n | ^"
- "Error [EPAR0370007]: unexpected identifier: expected 'assert', 'error', 'log' -- got 'test'\n --> test:1:9\n |\n 1 | console.test();\n | ^^^^" - "Error [EPAR0370007]: unexpected identifier: expected 'assert', 'error', 'log' -- got 'test'\n --> test:1:9\n |\n 1 | console.test();\n | ^^^^"

View File

@ -56,3 +56,4 @@ expectation: Fail
'\u{bbbbb}\u{aaaa}' '\u{bbbbb}\u{aaaa}'
'😭😂😘' '😭😂😘'
''

View File

@ -27,4 +27,6 @@ expectation: Fail
"\" "\"
"⭇😍; "⭇😍;
"2066:"

View File

@ -0,0 +1,9 @@
/*
namespace: Parse
expectation: Fail
*/
function main() {
// 202E<-here
let x = 1u8;
}

View File

@ -0,0 +1,9 @@
/*
namespace: Parse
expectation: Fail
*/
function main() {
/* next line starts with 2069
*/
}

View File

@ -0,0 +1,9 @@
/*
namespace: Parse
expectation: Fail
*/
function main() {
/* next line starts with 2069
*/
}

View File

@ -3,6 +3,8 @@ namespace: ParseStatement
expectation: Fail expectation: Fail
*/ */
console.error(""); // bidi override
console.log(1); console.log(1);
console.test(); console.test();