From 1e1486f4efe7b34001edf09ab11c0fc921433c0a Mon Sep 17 00:00:00 2001 From: gluax Date: Wed, 19 May 2021 18:14:56 -0400 Subject: [PATCH] refactor for better errors, code shouldn't have been in the lexer --- parser/src/errors/syntax.rs | 40 +++++ parser/src/parser/expression.rs | 118 +++++++++++++- parser/src/tokenizer/lexer.rs | 144 +++--------------- parser/src/tokenizer/token.rs | 2 +- .../compiler/char/invalid_char.leo.out | 2 +- .../parser/expression/literal/char.leo.out | 16 +- .../expression/literal/char_fail.leo.out | 8 +- .../expression/literal/char_parse.leo.out | 18 +++ tests/parser/expression/literal/char.leo | 2 + tests/parser/expression/literal/char_fail.leo | 6 +- .../parser/expression/literal/char_parse.leo | 2 + 11 files changed, 216 insertions(+), 142 deletions(-) diff --git a/parser/src/errors/syntax.rs b/parser/src/errors/syntax.rs index 98fbc6bad8..b1062527e9 100644 --- a/parser/src/errors/syntax.rs +++ b/parser/src/errors/syntax.rs @@ -48,6 +48,46 @@ impl SyntaxError { SyntaxError::Error(FormattedError::new_from_span(message, span)) } + pub fn invalid_char(character: Vec, span: &Span) -> Self { + Self::new_from_span(format!("Invalid character '{:?}'", character), span) + } + + pub fn invalid_empty_char(span: &Span) -> Self { + Self::new_from_span("Empty character '' is not valid".to_string(), span) + } + + pub fn invalid_escaped_char(character: char, span: &Span) -> Self { + Self::new_from_span(format!("Invalid escape character '\\{}'", character), span) + } + + pub fn invalid_hex_char(character: Vec, span: &Span) -> Self { + Self::new_from_span(format!("Invalid singe hex character '\\x{:?}'", character), span) + } + + pub fn invalid_hex_single_char(character: char, span: &Span) -> Self { + Self::new_from_span( + format!( + "Invalid singe hex character '\\x{}', expected '\\x0{}", + character, character + ), + span, + ) + } + + pub fn invalid_unicode_char(character: Vec, escaped: bool, span: &Span) -> Self { + if escaped { + return Self::new_from_span( + format!("Invalid unicode escaped character '\\u{{{:?}}}'", character), + span, + ); + } + + Self::new_from_span( + format!("Invalid unicode symbol character '\\u{{{:?}}}'", character), + span, + ) + } + pub fn invalid_import_list(span: &Span) -> Self { Self::new_from_span("Cannot import empty list".to_string(), span) } diff --git a/parser/src/parser/expression.rs b/parser/src/parser/expression.rs index 40ca52e2bd..beeec9541b 100644 --- a/parser/src/parser/expression.rs +++ b/parser/src/parser/expression.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License // along with the Leo library. If not, see . -use tendril::format_tendril; +use tendril::{format_tendril, StrTendril}; use super::*; @@ -643,6 +643,120 @@ impl ParserContext { } } + /// + /// Returns a character if it is a valid character that can be parsed. + /// + fn parse_char(&mut self, input_tendril: StrTendril, span: Span) -> SyntaxResult { + if input_tendril.is_empty() { + return Err(SyntaxError::invalid_empty_char(&span)); + } + + let input = input_tendril[..].as_bytes(); + let mut i = 0; + let mut escaped = false; + let mut hex = false; + let mut unicode = false; + let mut characters: Vec = vec![]; + + while i < input.len() { + if !escaped { + if input[i] == b'{' { + i += 1; + characters.clear(); + continue; + } + + if input[i] == b'}' { + i += 1; + continue; + } + } else { + escaped = false; + characters.clear(); + + match input[i] { + b'0' => characters.push(0), + b't' => characters.push(9), + b'n' => characters.push(10), + b'r' => characters.push(13), + b'\"' => characters.push(34), + b'\'' => characters.push(39), + b'\\' => characters.push(92), + b'x' => { + hex = true; + + i += 1; + continue; + } + b'u' => { + unicode = true; + } + _ => { + return Err(SyntaxError::invalid_escaped_char(input[i] as char, &span)); + } + } + + i += 1; + + continue; + } + + if input[i] == b'\\' { + escaped = true; + } + + characters.push(input[i]); + i += 1; + } + + return match characters.len() { + 1 | 2 | 3 | 4 | 5 | 6 if unicode => { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Ok(hex) = u32::from_str_radix(&string, 16) { + if hex <= 0x10FFFF { + if let Some(unicode_char) = std::char::from_u32(hex) { + return Ok(Expression::Value(ValueExpression::Char(unicode_char, span))); + } + } + } + } + + Err(SyntaxError::invalid_unicode_char(characters, true, &span)) + } + 1 => { + if hex { + return Err(SyntaxError::invalid_hex_single_char(characters[0] as char, &span)); + } else if escaped { + return Err(SyntaxError::invalid_escaped_char(characters[0] as char, &span)); + } + + Ok(Expression::Value(ValueExpression::Char(characters[0] as char, span))) + } + 2 if hex => { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Ok(number) = u8::from_str_radix(&string, 16) { + if number <= 127 { + return Ok(Expression::Value(ValueExpression::Char(number as char, span))); + } + } + } + + Err(SyntaxError::invalid_hex_char(characters, &span)) + } + 3 | 4 => { + // direct unicode symbol + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Some(character) = string.chars().next() { + return Ok(Expression::Value(ValueExpression::Char(character, span))); + } + } + + Err(SyntaxError::invalid_unicode_char(characters, false, &span)) + } + _ => Err(SyntaxError::invalid_char(characters, &span)), + }; + } + /// /// Returns an [`Expression`] AST node if the next token is a primary expression: /// - Literals: field, group, unsigned integer, signed integer, boolean, address @@ -689,7 +803,7 @@ impl ParserContext { Token::True => Expression::Value(ValueExpression::Boolean("true".into(), span)), Token::False => Expression::Value(ValueExpression::Boolean("false".into(), span)), Token::AddressLit(value) => Expression::Value(ValueExpression::Address(value, span)), - Token::CharLit(value) => Expression::Value(ValueExpression::Char(value, span)), + Token::CharLit(value) => self.parse_char(value, span)?, Token::LeftParen => self.parse_tuple_expression(&span)?, Token::LeftSquare => self.parse_array_expression(&span)?, Token::Ident(name) => { diff --git a/parser/src/tokenizer/lexer.rs b/parser/src/tokenizer/lexer.rs index 2a2850a71c..178a4432ad 100644 --- a/parser/src/tokenizer/lexer.rs +++ b/parser/src/tokenizer/lexer.rs @@ -61,132 +61,6 @@ fn eat_identifier(input_tendril: &StrTendril) -> Option { } impl Token { - /// - /// Returns a new `StrTendril` string if an character can be eaten, otherwise returns [`None`]. - /// - fn eat_char(input_tendril: &StrTendril) -> (usize, Option) { - // Probably better to move this logic to a parse_char. - // Would give better errors, and isolates logic from lexer. - // Lexer can just return content between single quotes. - if input_tendril.is_empty() { - return (0, None); - } - - let input = input_tendril[..].as_bytes(); - let mut i = 1; - let mut escaped = false; - let mut hex = false; - let mut unicode = false; - let mut last = false; - let mut characters: Vec = vec![]; - - while i < input.len() { - if !escaped { - if input[i] == b'\'' { - last = true; - i += 1; - break; - } - - if input[i] == b'{' { - i += 1; - characters.clear(); - continue; - } - - if input[i] == b'}' { - i += 1; - continue; - } - } else { - escaped = false; - characters.clear(); - - match input[i] { - b'0' => characters.push(0), - b't' => characters.push(9), - b'n' => characters.push(10), - b'r' => characters.push(13), - b'\"' => characters.push(34), - b'\'' => characters.push(39), - b'\\' => characters.push(92), - b'x' => { - hex = true; - - i += 1; - continue; - } - b'u' => { - unicode = true; - } - _ => { - return (0, None); - } - } - - i += 1; - - continue; - } - - if input[i] == b'\\' { - escaped = true; - } - - characters.push(input[i]); - i += 1; - } - - if !last { - return (0, None); - } - - return match characters.len() { - 1 | 2 | 3 | 4 | 5 if unicode => { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Ok(hex) = u32::from_str_radix(&string, 16) { - if hex <= 0x10FFFF { - if let Some(unicode_char) = std::char::from_u32(hex) { - return (i, Some(Token::CharLit(unicode_char))); - } - } - } - } - - (0, None) - } - 1 => { - if hex { - return (0, None); - } - - (i, Some(Token::CharLit(characters[0] as char))) - } - 2 if hex => { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Ok(number) = u8::from_str_radix(&string, 16) { - if number <= 127 { - return (i, Some(Token::CharLit(number as char))); - } - } - } - - (0, None) - } - 3 | 4 => { - // direct unicode symbol - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Some(character) = string.chars().next() { - return (i, Some(Token::CharLit(character))); - } - } - - (0, None) - } - _ => (0, None), - }; - } - /// /// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`]. /// An integer can be eaten if its bytes are at the front of the given `input_tendril` string. @@ -277,7 +151,23 @@ impl Token { return (i + 1, Some(Token::FormatString(segments))); } b'\'' => { - return Self::eat_char(&input_tendril); + let mut i = 1; + let mut end = false; + + while i < input.len() { + if input[i] == b'\'' { + end = true; + break; + } + + i += 1; + } + + if !end { + return (0, None); + } + + return (i + 1, Some(Token::CharLit(input_tendril.subtendril(1, (i - 1) as u32)))); } x if x.is_ascii_digit() => { return Self::eat_integer(&input_tendril); diff --git a/parser/src/tokenizer/token.rs b/parser/src/tokenizer/token.rs index d7da426589..5863041ae9 100644 --- a/parser/src/tokenizer/token.rs +++ b/parser/src/tokenizer/token.rs @@ -47,7 +47,7 @@ pub enum Token { True, False, AddressLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril), - CharLit(char), + CharLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril), At, diff --git a/tests/expectations/compiler/compiler/char/invalid_char.leo.out b/tests/expectations/compiler/compiler/char/invalid_char.leo.out index c529e663ae..8613ddc150 100644 --- a/tests/expectations/compiler/compiler/char/invalid_char.leo.out +++ b/tests/expectations/compiler/compiler/char/invalid_char.leo.out @@ -2,4 +2,4 @@ namespace: Compile expectation: Fail outputs: - - " --> compiler-test:4:23\n |\n 4 | const not_valid = '';\n | ^\n |\n = unexpected token: '''" + - " --> compiler-test:4:23\n |\n 4 | const not_valid = '';\n | ^^\n |\n = Empty character '' is not valid" diff --git a/tests/expectations/parser/parser/expression/literal/char.leo.out b/tests/expectations/parser/parser/expression/literal/char.leo.out index d6e0fd1fc1..0ce29e5695 100644 --- a/tests/expectations/parser/parser/expression/literal/char.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char.leo.out @@ -4,12 +4,14 @@ expectation: Pass outputs: - "'a' @ 1:1-4" - "'Z' @ 1:1-4" - - "'\"' @ 1:1-5" - - "'' @ 1:1-5" - - "'' @ 1:1-5" - - "'\u0000' @ 1:1-5" - - "'❤' @ 1:1-11" - - "'の' @ 1:1-11" + - "'\\\"' @ 1:1-5" + - "'\\t' @ 1:1-5" + - "'\\r' @ 1:1-5" + - "'\\0' @ 1:1-5" + - "'\\u{2764}' @ 1:1-11" + - "'\\u{306E}' @ 1:1-11" + - "'\\u{10FFFF}' @ 1:1-13" - "'❤' @ 1:1-6" - "'の' @ 1:1-6" - - "'*' @ 1:1-7" + - "'\\x0F' @ 1:1-7" + - "'\\x2A' @ 1:1-7" diff --git a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out index fb3cc4bdcc..4c71c693af 100644 --- a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out @@ -2,7 +2,9 @@ namespace: ParseExpression expectation: Fail outputs: - - " --> test:1:1\n |\n 1 | '\\'\n | ^\n |\n = unexpected token: '''" + - " --> test:1:1\n |\n 1 | '\\'\n | ^^^\n |\n = Invalid escape character '\\\\'" - " --> test:1:1\n |\n 1 | 'a\n | ^\n |\n = unexpected token: '''" - - " --> test:1:1\n |\n 1 | ''\n | ^\n |\n = unexpected token: '''" - - " --> test:1:1\n |\n 1 | '\\x9'\n | ^\n |\n = unexpected token: '''" + - " --> test:1:1\n |\n 1 | ''\n | ^^\n |\n = Empty character '' is not valid" + - " --> test:1:1\n |\n 1 | '\\x9'\n | ^^^^^\n |\n = Invalid singe hex character '\\x9', expected '\\x09" + - " --> test:1:1\n |\n 1 | '\\x80'\n | ^^^^^^\n |\n = Invalid singe hex character '\\x[56, 48]'" + - " --> test:1:1\n |\n 1 | '\\u{9999999}'\n | ^^^^^^^^^^^^^\n |\n = Invalid character '[57, 57, 57, 57, 57, 57, 57]'" diff --git a/tests/expectations/parser/parser/expression/literal/char_parse.leo.out b/tests/expectations/parser/parser/expression/literal/char_parse.leo.out index 3b080cd0a6..5ed19fa02e 100644 --- a/tests/expectations/parser/parser/expression/literal/char_parse.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_parse.leo.out @@ -74,6 +74,15 @@ outputs: col_stop: 11 path: test content: "'\\u{306E}'" + - Value: + Char: + - 􏿿 + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 13 + path: test + content: "'\\u{10FFFF}'" - Value: Char: - ❤ @@ -92,6 +101,15 @@ outputs: col_stop: 6 path: test content: "'の'" + - Value: + Char: + - "\u000f" + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 7 + path: test + content: "'\\x0F'" - Value: Char: - "*" diff --git a/tests/parser/expression/literal/char.leo b/tests/parser/expression/literal/char.leo index 5ea47f7dbf..71babf34f3 100644 --- a/tests/parser/expression/literal/char.leo +++ b/tests/parser/expression/literal/char.leo @@ -11,6 +11,8 @@ expectation: Pass '\0' '\u{2764}' '\u{306E}' +'\u{10FFFF}' '❤' 'の' +'\x0F' '\x2A' \ No newline at end of file diff --git a/tests/parser/expression/literal/char_fail.leo b/tests/parser/expression/literal/char_fail.leo index 565c6f3922..5991d04148 100644 --- a/tests/parser/expression/literal/char_fail.leo +++ b/tests/parser/expression/literal/char_fail.leo @@ -9,4 +9,8 @@ expectation: Fail '' -'\x9' \ No newline at end of file +'\x9' + +'\x80' + +'\u{9999999}' \ No newline at end of file diff --git a/tests/parser/expression/literal/char_parse.leo b/tests/parser/expression/literal/char_parse.leo index 515f6b10f3..3c22c813ce 100644 --- a/tests/parser/expression/literal/char_parse.leo +++ b/tests/parser/expression/literal/char_parse.leo @@ -11,6 +11,8 @@ expectation: Pass '\0' '\u{2764}' '\u{306E}' +'\u{10FFFF}' '❤' 'の' +'\x0F' '\x2A' \ No newline at end of file