From cce0f597dea4d432e5915c988286d11bf6b97915 Mon Sep 17 00:00:00 2001 From: gluax Date: Fri, 21 May 2021 12:33:39 -0400 Subject: [PATCH] Revert "refactor for better errors, code shouldn't have been in the lexer" This reverts commit 1e1486f4efe7b34001edf09ab11c0fc921433c0a. --- parser/src/errors/syntax.rs | 40 ----- parser/src/parser/expression.rs | 118 +------------- parser/src/tokenizer/lexer.rs | 144 +++++++++++++++--- parser/src/tokenizer/token.rs | 2 +- .../compiler/char/invalid_char.leo.out | 2 +- .../parser/expression/literal/char.leo.out | 16 +- .../expression/literal/char_fail.leo.out | 8 +- .../expression/literal/char_parse.leo.out | 18 --- tests/parser/expression/literal/char.leo | 2 - tests/parser/expression/literal/char_fail.leo | 6 +- .../parser/expression/literal/char_parse.leo | 2 - 11 files changed, 142 insertions(+), 216 deletions(-) diff --git a/parser/src/errors/syntax.rs b/parser/src/errors/syntax.rs index b1062527e9..98fbc6bad8 100644 --- a/parser/src/errors/syntax.rs +++ b/parser/src/errors/syntax.rs @@ -48,46 +48,6 @@ impl SyntaxError { SyntaxError::Error(FormattedError::new_from_span(message, span)) } - pub fn invalid_char(character: Vec, span: &Span) -> Self { - Self::new_from_span(format!("Invalid character '{:?}'", character), span) - } - - pub fn invalid_empty_char(span: &Span) -> Self { - Self::new_from_span("Empty character '' is not valid".to_string(), span) - } - - pub fn invalid_escaped_char(character: char, span: &Span) -> Self { - Self::new_from_span(format!("Invalid escape character '\\{}'", character), span) - } - - pub fn invalid_hex_char(character: Vec, span: &Span) -> Self { - Self::new_from_span(format!("Invalid singe hex character '\\x{:?}'", character), span) - } - - pub fn invalid_hex_single_char(character: char, span: &Span) -> Self { - Self::new_from_span( - format!( - "Invalid singe hex character '\\x{}', expected '\\x0{}", - character, character - ), - span, - ) - } - - pub fn invalid_unicode_char(character: Vec, escaped: bool, span: &Span) -> Self { - if escaped { - return Self::new_from_span( - format!("Invalid unicode escaped character '\\u{{{:?}}}'", character), - span, - ); - } - - Self::new_from_span( - format!("Invalid unicode symbol character '\\u{{{:?}}}'", character), - span, - ) - } - pub fn invalid_import_list(span: &Span) -> Self { Self::new_from_span("Cannot import empty list".to_string(), span) } diff --git a/parser/src/parser/expression.rs b/parser/src/parser/expression.rs index beeec9541b..40ca52e2bd 100644 --- a/parser/src/parser/expression.rs +++ b/parser/src/parser/expression.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License // along with the Leo library. If not, see . -use tendril::{format_tendril, StrTendril}; +use tendril::format_tendril; use super::*; @@ -643,120 +643,6 @@ impl ParserContext { } } - /// - /// Returns a character if it is a valid character that can be parsed. - /// - fn parse_char(&mut self, input_tendril: StrTendril, span: Span) -> SyntaxResult { - if input_tendril.is_empty() { - return Err(SyntaxError::invalid_empty_char(&span)); - } - - let input = input_tendril[..].as_bytes(); - let mut i = 0; - let mut escaped = false; - let mut hex = false; - let mut unicode = false; - let mut characters: Vec = vec![]; - - while i < input.len() { - if !escaped { - if input[i] == b'{' { - i += 1; - characters.clear(); - continue; - } - - if input[i] == b'}' { - i += 1; - continue; - } - } else { - escaped = false; - characters.clear(); - - match input[i] { - b'0' => characters.push(0), - b't' => characters.push(9), - b'n' => characters.push(10), - b'r' => characters.push(13), - b'\"' => characters.push(34), - b'\'' => characters.push(39), - b'\\' => characters.push(92), - b'x' => { - hex = true; - - i += 1; - continue; - } - b'u' => { - unicode = true; - } - _ => { - return Err(SyntaxError::invalid_escaped_char(input[i] as char, &span)); - } - } - - i += 1; - - continue; - } - - if input[i] == b'\\' { - escaped = true; - } - - characters.push(input[i]); - i += 1; - } - - return match characters.len() { - 1 | 2 | 3 | 4 | 5 | 6 if unicode => { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Ok(hex) = u32::from_str_radix(&string, 16) { - if hex <= 0x10FFFF { - if let Some(unicode_char) = std::char::from_u32(hex) { - return Ok(Expression::Value(ValueExpression::Char(unicode_char, span))); - } - } - } - } - - Err(SyntaxError::invalid_unicode_char(characters, true, &span)) - } - 1 => { - if hex { - return Err(SyntaxError::invalid_hex_single_char(characters[0] as char, &span)); - } else if escaped { - return Err(SyntaxError::invalid_escaped_char(characters[0] as char, &span)); - } - - Ok(Expression::Value(ValueExpression::Char(characters[0] as char, span))) - } - 2 if hex => { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Ok(number) = u8::from_str_radix(&string, 16) { - if number <= 127 { - return Ok(Expression::Value(ValueExpression::Char(number as char, span))); - } - } - } - - Err(SyntaxError::invalid_hex_char(characters, &span)) - } - 3 | 4 => { - // direct unicode symbol - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Some(character) = string.chars().next() { - return Ok(Expression::Value(ValueExpression::Char(character, span))); - } - } - - Err(SyntaxError::invalid_unicode_char(characters, false, &span)) - } - _ => Err(SyntaxError::invalid_char(characters, &span)), - }; - } - /// /// Returns an [`Expression`] AST node if the next token is a primary expression: /// - Literals: field, group, unsigned integer, signed integer, boolean, address @@ -803,7 +689,7 @@ impl ParserContext { Token::True => Expression::Value(ValueExpression::Boolean("true".into(), span)), Token::False => Expression::Value(ValueExpression::Boolean("false".into(), span)), Token::AddressLit(value) => Expression::Value(ValueExpression::Address(value, span)), - Token::CharLit(value) => self.parse_char(value, span)?, + Token::CharLit(value) => Expression::Value(ValueExpression::Char(value, span)), Token::LeftParen => self.parse_tuple_expression(&span)?, Token::LeftSquare => self.parse_array_expression(&span)?, Token::Ident(name) => { diff --git a/parser/src/tokenizer/lexer.rs b/parser/src/tokenizer/lexer.rs index 178a4432ad..2a2850a71c 100644 --- a/parser/src/tokenizer/lexer.rs +++ b/parser/src/tokenizer/lexer.rs @@ -61,6 +61,132 @@ fn eat_identifier(input_tendril: &StrTendril) -> Option { } impl Token { + /// + /// Returns a new `StrTendril` string if an character can be eaten, otherwise returns [`None`]. + /// + fn eat_char(input_tendril: &StrTendril) -> (usize, Option) { + // Probably better to move this logic to a parse_char. + // Would give better errors, and isolates logic from lexer. + // Lexer can just return content between single quotes. + if input_tendril.is_empty() { + return (0, None); + } + + let input = input_tendril[..].as_bytes(); + let mut i = 1; + let mut escaped = false; + let mut hex = false; + let mut unicode = false; + let mut last = false; + let mut characters: Vec = vec![]; + + while i < input.len() { + if !escaped { + if input[i] == b'\'' { + last = true; + i += 1; + break; + } + + if input[i] == b'{' { + i += 1; + characters.clear(); + continue; + } + + if input[i] == b'}' { + i += 1; + continue; + } + } else { + escaped = false; + characters.clear(); + + match input[i] { + b'0' => characters.push(0), + b't' => characters.push(9), + b'n' => characters.push(10), + b'r' => characters.push(13), + b'\"' => characters.push(34), + b'\'' => characters.push(39), + b'\\' => characters.push(92), + b'x' => { + hex = true; + + i += 1; + continue; + } + b'u' => { + unicode = true; + } + _ => { + return (0, None); + } + } + + i += 1; + + continue; + } + + if input[i] == b'\\' { + escaped = true; + } + + characters.push(input[i]); + i += 1; + } + + if !last { + return (0, None); + } + + return match characters.len() { + 1 | 2 | 3 | 4 | 5 if unicode => { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Ok(hex) = u32::from_str_radix(&string, 16) { + if hex <= 0x10FFFF { + if let Some(unicode_char) = std::char::from_u32(hex) { + return (i, Some(Token::CharLit(unicode_char))); + } + } + } + } + + (0, None) + } + 1 => { + if hex { + return (0, None); + } + + (i, Some(Token::CharLit(characters[0] as char))) + } + 2 if hex => { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Ok(number) = u8::from_str_radix(&string, 16) { + if number <= 127 { + return (i, Some(Token::CharLit(number as char))); + } + } + } + + (0, None) + } + 3 | 4 => { + // direct unicode symbol + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Some(character) = string.chars().next() { + return (i, Some(Token::CharLit(character))); + } + } + + (0, None) + } + _ => (0, None), + }; + } + /// /// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`]. /// An integer can be eaten if its bytes are at the front of the given `input_tendril` string. @@ -151,23 +277,7 @@ impl Token { return (i + 1, Some(Token::FormatString(segments))); } b'\'' => { - let mut i = 1; - let mut end = false; - - while i < input.len() { - if input[i] == b'\'' { - end = true; - break; - } - - i += 1; - } - - if !end { - return (0, None); - } - - return (i + 1, Some(Token::CharLit(input_tendril.subtendril(1, (i - 1) as u32)))); + return Self::eat_char(&input_tendril); } x if x.is_ascii_digit() => { return Self::eat_integer(&input_tendril); diff --git a/parser/src/tokenizer/token.rs b/parser/src/tokenizer/token.rs index 5863041ae9..d7da426589 100644 --- a/parser/src/tokenizer/token.rs +++ b/parser/src/tokenizer/token.rs @@ -47,7 +47,7 @@ pub enum Token { True, False, AddressLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril), - CharLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril), + CharLit(char), At, diff --git a/tests/expectations/compiler/compiler/char/invalid_char.leo.out b/tests/expectations/compiler/compiler/char/invalid_char.leo.out index 8613ddc150..c529e663ae 100644 --- a/tests/expectations/compiler/compiler/char/invalid_char.leo.out +++ b/tests/expectations/compiler/compiler/char/invalid_char.leo.out @@ -2,4 +2,4 @@ namespace: Compile expectation: Fail outputs: - - " --> compiler-test:4:23\n |\n 4 | const not_valid = '';\n | ^^\n |\n = Empty character '' is not valid" + - " --> compiler-test:4:23\n |\n 4 | const not_valid = '';\n | ^\n |\n = unexpected token: '''" diff --git a/tests/expectations/parser/parser/expression/literal/char.leo.out b/tests/expectations/parser/parser/expression/literal/char.leo.out index 0ce29e5695..d6e0fd1fc1 100644 --- a/tests/expectations/parser/parser/expression/literal/char.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char.leo.out @@ -4,14 +4,12 @@ expectation: Pass outputs: - "'a' @ 1:1-4" - "'Z' @ 1:1-4" - - "'\\\"' @ 1:1-5" - - "'\\t' @ 1:1-5" - - "'\\r' @ 1:1-5" - - "'\\0' @ 1:1-5" - - "'\\u{2764}' @ 1:1-11" - - "'\\u{306E}' @ 1:1-11" - - "'\\u{10FFFF}' @ 1:1-13" + - "'\"' @ 1:1-5" + - "'' @ 1:1-5" + - "'' @ 1:1-5" + - "'\u0000' @ 1:1-5" + - "'❤' @ 1:1-11" + - "'の' @ 1:1-11" - "'❤' @ 1:1-6" - "'の' @ 1:1-6" - - "'\\x0F' @ 1:1-7" - - "'\\x2A' @ 1:1-7" + - "'*' @ 1:1-7" diff --git a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out index 4c71c693af..fb3cc4bdcc 100644 --- a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out @@ -2,9 +2,7 @@ namespace: ParseExpression expectation: Fail outputs: - - " --> test:1:1\n |\n 1 | '\\'\n | ^^^\n |\n = Invalid escape character '\\\\'" + - " --> test:1:1\n |\n 1 | '\\'\n | ^\n |\n = unexpected token: '''" - " --> test:1:1\n |\n 1 | 'a\n | ^\n |\n = unexpected token: '''" - - " --> test:1:1\n |\n 1 | ''\n | ^^\n |\n = Empty character '' is not valid" - - " --> test:1:1\n |\n 1 | '\\x9'\n | ^^^^^\n |\n = Invalid singe hex character '\\x9', expected '\\x09" - - " --> test:1:1\n |\n 1 | '\\x80'\n | ^^^^^^\n |\n = Invalid singe hex character '\\x[56, 48]'" - - " --> test:1:1\n |\n 1 | '\\u{9999999}'\n | ^^^^^^^^^^^^^\n |\n = Invalid character '[57, 57, 57, 57, 57, 57, 57]'" + - " --> test:1:1\n |\n 1 | ''\n | ^\n |\n = unexpected token: '''" + - " --> test:1:1\n |\n 1 | '\\x9'\n | ^\n |\n = unexpected token: '''" diff --git a/tests/expectations/parser/parser/expression/literal/char_parse.leo.out b/tests/expectations/parser/parser/expression/literal/char_parse.leo.out index 5ed19fa02e..3b080cd0a6 100644 --- a/tests/expectations/parser/parser/expression/literal/char_parse.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_parse.leo.out @@ -74,15 +74,6 @@ outputs: col_stop: 11 path: test content: "'\\u{306E}'" - - Value: - Char: - - 􏿿 - - line_start: 1 - line_stop: 1 - col_start: 1 - col_stop: 13 - path: test - content: "'\\u{10FFFF}'" - Value: Char: - ❤ @@ -101,15 +92,6 @@ outputs: col_stop: 6 path: test content: "'の'" - - Value: - Char: - - "\u000f" - - line_start: 1 - line_stop: 1 - col_start: 1 - col_stop: 7 - path: test - content: "'\\x0F'" - Value: Char: - "*" diff --git a/tests/parser/expression/literal/char.leo b/tests/parser/expression/literal/char.leo index 71babf34f3..5ea47f7dbf 100644 --- a/tests/parser/expression/literal/char.leo +++ b/tests/parser/expression/literal/char.leo @@ -11,8 +11,6 @@ expectation: Pass '\0' '\u{2764}' '\u{306E}' -'\u{10FFFF}' '❤' 'の' -'\x0F' '\x2A' \ No newline at end of file diff --git a/tests/parser/expression/literal/char_fail.leo b/tests/parser/expression/literal/char_fail.leo index 5991d04148..565c6f3922 100644 --- a/tests/parser/expression/literal/char_fail.leo +++ b/tests/parser/expression/literal/char_fail.leo @@ -9,8 +9,4 @@ expectation: Fail '' -'\x9' - -'\x80' - -'\u{9999999}' \ No newline at end of file +'\x9' \ No newline at end of file diff --git a/tests/parser/expression/literal/char_parse.leo b/tests/parser/expression/literal/char_parse.leo index 3c22c813ce..515f6b10f3 100644 --- a/tests/parser/expression/literal/char_parse.leo +++ b/tests/parser/expression/literal/char_parse.leo @@ -11,8 +11,6 @@ expectation: Pass '\0' '\u{2764}' '\u{306E}' -'\u{10FFFF}' '❤' 'の' -'\x0F' '\x2A' \ No newline at end of file