diff --git a/errors/src/parser/parser_errors.rs b/errors/src/parser/parser_errors.rs
index 17757a94ba..15cbb19241 100644
--- a/errors/src/parser/parser_errors.rs
+++ b/errors/src/parser/parser_errors.rs
@@ -16,7 +16,7 @@
 use crate::create_errors;
 
-use std::fmt::Display;
+use std::fmt::{Debug, Display};
 
 create_errors!(
     /// ParserError enum that represents all the errors for the `leo-parser` crate.
@@ -223,4 +223,76 @@ create_errors!(
         msg: "Array dimensions specified as a tuple cannot be empty.",
         help: None,
     }
+
+    /// When more characters were expected to lex but none were found.
+    @backtraced
+    lexer_empty_input_tendril {
+        args: (),
+        msg: "Expected more characters to lex but found none.",
+        help: None,
+    }
+
+    /// When an integer is started with a leading zero.
+    @backtraced
+    lexer_eat_integer_leading_zero {
+        args: (input: impl Display),
+        msg: format!("Tried to eat integer but found a leading zero on {}.", input),
+        help: None,
+    }
+
+    /// When an escaped character is not valid.
+    @backtraced
+    lexer_expected_valid_escaped_char {
+        args: (input: impl Display),
+        msg: format!("Expected a valid escape character but found {}.", input),
+        help: None,
+    }
+
+    /// When a string is not properly closed.
+    @backtraced
+    lexer_string_not_closed {
+        args: (input: impl Display),
+        msg: format!("Expected a closed string but found {}.", input),
+        help: None,
+    }
+
+    /// When a char is not properly closed.
+    @backtraced
+    lexer_char_not_closed {
+        args: (input: impl Display),
+        msg: format!("Expected a closed char but found {}.", input),
+        help: None,
+    }
+
+    /// When a char is not valid.
+    @backtraced
+    lexer_invalid_char {
+        args: (input: impl Display),
+        msg: format!("Expected valid character but found {}.", input),
+        help: None,
+    }
+
+    /// When a block comment is empty.
+    @backtraced
+    lexer_empty_block_comment {
+        args: (),
+        msg: "Empty block comment.",
+        help: None,
+    }
+
+    /// When a block comment is not closed before end of file.
+    @backtraced
+    lexer_block_comment_does_not_close_before_eof {
+        args: (input: impl Display),
+        msg: format!("Block comment does not close with content: {}.", input),
+        help: None,
+    }
+
+    /// When the lexer could not lex some text.
+    @backtraced
+    could_not_lex {
+        args: (input: impl Display),
+        msg: format!("Could not lex the following content: {}.", input),
+        help: None,
+    }
 );
diff --git a/parser/src/tokenizer/lexer.rs b/parser/src/tokenizer/lexer.rs
index 37e899a03f..be73b78d95 100644
--- a/parser/src/tokenizer/lexer.rs
+++ b/parser/src/tokenizer/lexer.rs
@@ -15,6 +15,7 @@
 // along with the Leo library. If not, see <https://www.gnu.org/licenses/>.
 
 use crate::tokenizer::{Char, Token};
+use leo_errors::{ParserError, Result};
 use leo_span::{Span, Symbol};
 
 use serde::{Deserialize, Serialize};
@@ -147,13 +148,13 @@ impl Token {
     /// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`].
     /// An integer can be eaten if its bytes are at the front of the given `input_tendril` string.
     ///
-    fn eat_integer(input_tendril: &StrTendril) -> (usize, Option<Token>) {
+    fn eat_integer(input_tendril: &StrTendril) -> Result<(usize, Token)> {
         if input_tendril.is_empty() {
-            return (0, None);
+            return Err(ParserError::lexer_empty_input_tendril().into());
         }
         let input = input_tendril[..].as_bytes();
         if !input[0].is_ascii_digit() {
-            return (0, None);
+            return Err(ParserError::lexer_eat_integer_leading_zero(String::from_utf8_lossy(input)).into());
         }
         let mut i = 1;
         let mut is_hex = false;
@@ -173,7 +174,7 @@ impl Token {
             i += 1;
         }
 
-        (i, Some(Token::Int(input_tendril.subtendril(0, i as u32))))
+        Ok((i, Token::Int(input_tendril.subtendril(0, i as u32))))
     }
 
     /// Returns the number of bytes in an emoji via a bit mask.
@@ -197,13 +198,13 @@ impl Token {
     /// Returns a tuple: [(token length, token)] if the next token can be eaten, otherwise returns [`None`].
     /// The next token can be eaten if the bytes at the front of the given `input_tendril` string can be scanned into a token.
     ///
-    pub(crate) fn eat(input_tendril: StrTendril) -> (usize, Option<Token>) {
+    pub(crate) fn eat(input_tendril: StrTendril) -> Result<(usize, Token)> {
         if input_tendril.is_empty() {
-            return (0, None);
+            return Err(ParserError::lexer_empty_input_tendril().into());
         }
         let input = input_tendril[..].as_bytes();
         match input[0] {
-            x if x.is_ascii_whitespace() => return (1, None),
+            x if x.is_ascii_whitespace() => return Ok((1, Token::WhiteSpace)),
             b'"' => {
                 let mut i = 1;
                 let mut len: u8 = 1;
@@ -270,7 +271,12 @@ impl Token {
                             unicode = false;
                             string.push(character.into());
                         }
-                        None => return (0, None),
+                        None => {
+                            return Err(ParserError::lexer_expected_valid_escaped_char(
+                                input_tendril.subtendril(start as u32, len as u32),
+                            )
+                            .into())
+                        }
                     }
                 }
 
@@ -282,10 +288,10 @@
                 }
 
                 if i == input.len() || !end {
-                    return (0, None);
+                    return Err(ParserError::lexer_string_not_closed(String::from_utf8_lossy(&input[0..i])).into());
                 }
 
-                return (i + 1, Some(Token::StringLit(string)));
+                return Ok((i + 1, Token::StringLit(string)));
             }
             b'\'' => {
                 let mut i = 1;
@@ -310,7 +316,7 @@
                         if input[i + 1] == b'{' {
                             unicode = true;
                         } else {
-                            return (0, None);
+                            return Err(ParserError::lexer_expected_valid_escaped_char(input[i]).into());
                         }
                     } else {
                         escaped = true;
@@ -323,12 +329,12 @@
                 }
 
                 if !end {
-                    return (0, None);
+                    return Err(ParserError::lexer_char_not_closed(String::from_utf8_lossy(&input[0..i])).into());
                }
 
                 return match Self::eat_char(input_tendril.subtendril(1, (i - 1) as u32), escaped, hex, unicode) {
-                    Some(character) => (i + 1, Some(Token::CharLit(character))),
-                    None => (0, None),
+                    Some(character) => Ok((i + 1, Token::CharLit(character))),
+                    None => Err(ParserError::lexer_invalid_char(String::from_utf8_lossy(&input[0..i - 1])).into()),
                 };
             }
             x if x.is_ascii_digit() => {
@@ -336,119 +342,122 @@
             }
             b'!' => {
                 if let Some(len) = eat(input, "!=") {
-                    return (len, Some(Token::NotEq));
+                    return Ok((len, Token::NotEq));
                 }
-                return (1, Some(Token::Not));
+                return Ok((1, Token::Not));
             }
             b'?' => {
-                return (1, Some(Token::Question));
+                return Ok((1, Token::Question));
             }
             b'&' => {
                 if let Some(len) = eat(input, "&&") {
-                    return (len, Some(Token::And));
+                    return Ok((len, Token::And));
                 }
-                return (1, Some(Token::Ampersand));
+                return Ok((1, Token::Ampersand));
             }
-            b'(' => return (1, Some(Token::LeftParen)),
-            b')' => return (1, Some(Token::RightParen)),
-            b'_' => return (1, Some(Token::Underscore)),
+            b'(' => return Ok((1, Token::LeftParen)),
+            b')' => return Ok((1, Token::RightParen)),
+            b'_' => return Ok((1, Token::Underscore)),
             b'*' => {
                 if let Some(len) = eat(input, "**") {
                     if let Some(inner_len) = eat(&input[len..], "=") {
-                        return (len + inner_len, Some(Token::ExpEq));
+                        return Ok((len + inner_len, Token::ExpEq));
                     }
-                    return (len, Some(Token::Exp));
+                    return Ok((len, Token::Exp));
                 } else if let Some(len) = eat(input, "*=") {
-                    return (len, Some(Token::MulEq));
+                    return Ok((len, Token::MulEq));
                 }
-                return (1, Some(Token::Mul));
+                return Ok((1, Token::Mul));
             }
             b'+' => {
                 if let Some(len) = eat(input, "+=") {
-                    return (len, Some(Token::AddEq));
+                    return Ok((len, Token::AddEq));
                 }
-                return (1, Some(Token::Add));
+                return Ok((1, Token::Add));
             }
-            b',' => return (1, Some(Token::Comma)),
+            b',' => return Ok((1, Token::Comma)),
             b'-' => {
                 if let Some(len) = eat(input, "->") {
-                    return (len, Some(Token::Arrow));
+                    return Ok((len, Token::Arrow));
                 } else if let Some(len) = eat(input, "-=") {
-                    return (len, Some(Token::MinusEq));
+                    return Ok((len, Token::MinusEq));
                 }
-                return (1, Some(Token::Minus));
+                return Ok((1, Token::Minus));
             }
             b'.' => {
                 if let Some(len) = eat(input, "...") {
-                    return (len, Some(Token::DotDotDot));
+                    return Ok((len, Token::DotDotDot));
                 } else if let Some(len) = eat(input, "..") {
-                    return (len, Some(Token::DotDot));
+                    return Ok((len, Token::DotDot));
                 }
-                return (1, Some(Token::Dot));
+                return Ok((1, Token::Dot));
             }
             b'/' => {
                 if eat(input, "//").is_some() {
                     let eol = input.iter().position(|x| *x == b'\n');
                     let len = if let Some(eol) = eol { eol + 1 } else { input.len() };
-                    return (len, Some(Token::CommentLine(input_tendril.subtendril(0, len as u32))));
+                    return Ok((len, Token::CommentLine(input_tendril.subtendril(0, len as u32))));
                 } else if eat(input, "/*").is_some() {
                     if input.is_empty() {
-                        return (0, None);
+                        return Err(ParserError::lexer_empty_block_comment().into());
                     }
                     let eol = input.windows(2).skip(2).position(|x| x[0] == b'*' && x[1] == b'/');
                     let len = if let Some(eol) = eol {
                         eol + 4
                     } else {
-                        return (0, None);
+                        return Err(ParserError::lexer_block_comment_does_not_close_before_eof(
+                            String::from_utf8_lossy(&input[0..]),
+                        )
+                        .into());
                     };
-                    return (len, Some(Token::CommentBlock(input_tendril.subtendril(0, len as u32))));
+                    return Ok((len, Token::CommentBlock(input_tendril.subtendril(0, len as u32))));
                 } else if let Some(len) = eat(input, "/=") {
-                    return (len, Some(Token::DivEq));
+                    return Ok((len, Token::DivEq));
                 }
-                return (1, Some(Token::Div));
+                return Ok((1, Token::Div));
             }
             b':' => {
                 if let Some(len) = eat(input, "::") {
-                    return (len, Some(Token::DoubleColon));
+                    return Ok((len, Token::DoubleColon));
                 } else {
-                    return (1, Some(Token::Colon));
+                    return Ok((1, Token::Colon));
                 }
             }
-            b';' => return (1, Some(Token::Semicolon)),
+            b';' => return Ok((1, Token::Semicolon)),
             b'<' => {
                 if let Some(len) = eat(input, "<=") {
-                    return (len, Some(Token::LtEq));
+                    return Ok((len, Token::LtEq));
                 }
-                return (1, Some(Token::Lt));
+                return Ok((1, Token::Lt));
             }
             b'>' => {
                 if let Some(len) = eat(input, ">=") {
-                    return (len, Some(Token::GtEq));
+                    return Ok((len, Token::GtEq));
                 }
-                return (1, Some(Token::Gt));
+                return Ok((1, Token::Gt));
             }
             b'=' => {
                 if let Some(len) = eat(input, "==") {
-                    return (len, Some(Token::Eq));
+                    return Ok((len, Token::Eq));
                 }
-                return (1, Some(Token::Assign));
+                return Ok((1, Token::Assign));
             }
-            b'@' => return (1, Some(Token::At)),
-            b'[' => return (1, Some(Token::LeftSquare)),
-            b']' => return (1, Some(Token::RightSquare)),
-            b'{' => return (1, Some(Token::LeftCurly)),
-            b'}' => return (1, Some(Token::RightCurly)),
+            b'@' => return Ok((1, Token::At)),
+            b'[' => return Ok((1, Token::LeftSquare)),
+            b']' => return Ok((1, Token::RightSquare)),
+            b'{' => return Ok((1, Token::LeftCurly)),
+            b'}' => return Ok((1, Token::RightCurly)),
             b'|' => {
                 if let Some(len) = eat(input, "||") {
-                    return (len, Some(Token::Or));
+                    return Ok((len, Token::Or));
                 }
             }
             _ => (),
         }
 
         if let Some(ident) = eat_identifier(&input_tendril) {
-            return (
+            return Ok((
                 ident.len(),
-                Some(match &*ident {
+                match &*ident {
                     x if x.starts_with("aleo1") => Token::AddressLit(ident),
                     "address" => Token::Address,
                     "as" => Token::As,
@@ -486,11 +495,11 @@
                     "u64" => Token::U64,
                     "u128" => Token::U128,
                     _ => Token::Ident(Symbol::intern(&ident)),
-                }),
-            );
+                },
+            ));
         }
 
-        (0, None)
+        Err(ParserError::could_not_lex(String::from_utf8_lossy(&input[0..])).into())
     }
 }
diff --git a/parser/src/tokenizer/mod.rs b/parser/src/tokenizer/mod.rs
index 52fded2834..f48b3222e4 100644
--- a/parser/src/tokenizer/mod.rs
+++ b/parser/src/tokenizer/mod.rs
@@ -28,21 +28,55 @@ pub(crate) use self::token::*;
 pub(crate) mod lexer;
 pub(crate) use self::lexer::*;
 
-use leo_errors::{LeoError, ParserError};
+use leo_errors::{ParserError, Result};
 use leo_span::Span;
 
 use tendril::StrTendril;
 
 /// Creates a new vector of spanned tokens from a given file path and source code text.
-pub(crate) fn tokenize(path: &str, input: StrTendril) -> Result<Vec<SpannedToken>, LeoError> {
+pub(crate) fn tokenize(path: &str, input: StrTendril) -> Result<Vec<SpannedToken>> {
     let path = Arc::new(path.to_string());
     let mut tokens = vec![];
     let mut index = 0usize;
     let mut line_no = 1usize;
     let mut line_start = 0usize;
     while input.len() > index {
-        match Token::eat(input.subtendril(index as u32, (input.len() - index) as u32)) {
-            (token_len, Some(token)) => {
+        match Token::eat(input.subtendril(index as u32, (input.len() - index) as u32))? {
+            (token_len, Token::WhiteSpace) => {
+                if token_len == 0 && index == input.len() {
+                    break;
+                } else if token_len == 0 {
+                    return Err(ParserError::unexpected_token(
+                        &input[index..].chars().next().unwrap(),
+                        &Span::new(
+                            line_no,
+                            line_no,
+                            index - line_start + 1,
+                            index - line_start + 2,
+                            path,
+                            input.subtendril(
+                                line_start as u32,
+                                input[line_start..].find('\n').unwrap_or_else(|| input.len()) as u32,
+                            ),
+                        ),
+                    )
+                    .into());
+                }
+
+                let bytes = input.as_bytes();
+                if bytes[index] == 0x000D && matches!(bytes.get(index + 1), Some(0x000A)) {
+                    // Check for a carriage return followed by a newline.
+                    line_no += 1;
+                    line_start = index + token_len;
+                    index += token_len;
+                } else if matches!(bytes[index], 0x000A | 0x000D) {
+                    // Check for a newline or carriage return.
+                    line_no += 1;
+                    line_start = index + token_len;
+                }
+                index += token_len;
+            }
+            (token_len, token) => {
                 let mut span = Span::new(
                     line_no,
                     line_no,
@@ -79,32 +113,6 @@ pub(crate) fn tokenize(path: &str, input: StrTendril) -> Result<Vec<SpannedToken>, LeoError> {
                 tokens.push(SpannedToken { token, span });
                 index += token_len;
             }
-            (token_len, None) => {
-                if token_len == 0 && index == input.len() {
-                    break;
-                } else if token_len == 0 {
-                    return Err(ParserError::unexpected_token(
-                        &input[index..].chars().next().unwrap(),
-                        &Span::new(
-                            line_no,
-                            line_no,
-                            index - line_start + 1,
-                            index - line_start + 2,
-                            path,
-                            input.subtendril(
-                                line_start as u32,
-                                input[line_start..].find('\n').unwrap_or_else(|| input.len()) as u32,
-                            ),
-                        ),
-                    )
-                    .into());
-                }
-                if input.as_bytes()[index] == b'\n' {
-                    line_no += 1;
-                    line_start = index + token_len;
-                }
-                index += token_len;
-            }
         }
     }
     Ok(tokens)
@@ -214,7 +222,7 @@ mod tests {
             .unwrap();
         let mut output = String::new();
         for SpannedToken { token, .. } in tokens.iter() {
-            output += &format!("{} ", token.to_string());
+            output += &format!("{} ", token);
         }
 
         assert_eq!(
@@ -229,7 +237,7 @@ fn test_spans() {
         create_session_if_not_set_then(|_| {
             let raw = r#"
-test
+ppp test
 // test
 test
 /* test */
diff --git a/parser/src/tokenizer/token.rs b/parser/src/tokenizer/token.rs
index ce3347beb0..2ee7c55ff7 100644
--- a/parser/src/tokenizer/token.rs
+++ b/parser/src/tokenizer/token.rs
@@ -59,6 +59,7 @@ pub enum Token {
     False,
     AddressLit(#[serde(with = "leo_span::tendril_json")] StrTendril),
     CharLit(Char),
+    WhiteSpace,
 
     // Symbols
     At,
@@ -258,6 +259,7 @@ impl fmt::Display for Token {
             False => write!(f, "false"),
             AddressLit(s) => write!(f, "{}", s),
             CharLit(s) => write!(f, "{}", s),
+            WhiteSpace => write!(f, "whitespace"),
 
             At => write!(f, "@"),
diff --git a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out
index 46390f4fb9..4642d0114d 100644
--- a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out
+++ b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out
@@ -2,38 +2,38 @@
 namespace: Token
 expectation: Fail
 outputs:
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | 'a\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | ''\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\x9A'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\x7'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\x7g'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\xz'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\x80'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\xc1'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\xc2'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\xDF'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\xC0'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\xe0'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\x9f'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | 'abcdefg'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\t\\t'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\a'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\z'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\A'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\Z'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\1'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\9'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\*'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\x'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u{bbbbb}\\u{aaaa}'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\uz'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u1'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u123'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u{2764z'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u{276g}'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u00000000'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u01000000'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '\\u9999999'\n | ^"
-  - "Error [EPAR0370000]: '\n --> test:1:1\n |\n 1 | '😭😂😘'\n | ^"
+  - "Error [EPAR0370028]: Expected a closed char but found '\\'."
+  - "Error [EPAR0370028]: Expected a closed char but found 'a."
+  - "Error [EPAR0370029]: Expected valid character but found ."
+  - "Error [EPAR0370029]: Expected valid character but found '\\x9."
+  - "Error [EPAR0370029]: Expected valid character but found '\\x."
+  - "Error [EPAR0370029]: Expected valid character but found '\\x7."
+  - "Error [EPAR0370029]: Expected valid character but found '\\x."
+  - "Error [EPAR0370029]: Expected valid character but found '\\x8."
+  - "Error [EPAR0370029]: Expected valid character but found '\\xc."
+  - "Error [EPAR0370029]: Expected valid character but found '\\xc."
+  - "Error [EPAR0370029]: Expected valid character but found '\\xD."
+  - "Error [EPAR0370029]: Expected valid character but found '\\xC."
+  - "Error [EPAR0370029]: Expected valid character but found '\\xe."
+  - "Error [EPAR0370029]: Expected valid character but found '\\x9."
+  - "Error [EPAR0370029]: Expected valid character but found 'abcdef."
+  - "Error [EPAR0370029]: Expected valid character but found '\\t\\."
+  - "Error [EPAR0370029]: Expected valid character but found '\\."
+  - "Error [EPAR0370029]: Expected valid character but found '\\."
+  - "Error [EPAR0370029]: Expected valid character but found '\\."
+  - "Error [EPAR0370029]: Expected valid character but found '\\."
+  - "Error [EPAR0370029]: Expected valid character but found '\\."
+  - "Error [EPAR0370029]: Expected valid character but found '\\."
+  - "Error [EPAR0370029]: Expected valid character but found '\\."
+  - "Error [EPAR0370029]: Expected valid character but found '\\."
+  - "Error [EPAR0370026]: Expected a valid escape character but found 117."
+  - "Error [EPAR0370029]: Expected valid character but found '\\u{bbbbb}\\u{aaaa."
+  - "Error [EPAR0370026]: Expected a valid escape character but found 117."
+  - "Error [EPAR0370026]: Expected a valid escape character but found 117."
+  - "Error [EPAR0370026]: Expected a valid escape character but found 117."
+  - "Error [EPAR0370029]: Expected valid character but found '\\u{2764."
+  - "Error [EPAR0370029]: Expected valid character but found '\\u{276g."
+  - "Error [EPAR0370026]: Expected a valid escape character but found 117."
+  - "Error [EPAR0370026]: Expected a valid escape character but found 117."
+  - "Error [EPAR0370026]: Expected a valid escape character but found 117."
+ - "Error [EPAR0370029]: Expected valid character but found '😭😂�." diff --git a/tests/expectations/parser/parser/expression/literal/string_fail.leo.out b/tests/expectations/parser/parser/expression/literal/string_fail.leo.out index 4d8c97fc71..2b1fcd8d31 100644 --- a/tests/expectations/parser/parser/expression/literal/string_fail.leo.out +++ b/tests/expectations/parser/parser/expression/literal/string_fail.leo.out @@ -2,10 +2,10 @@ namespace: Token expectation: Fail outputs: - - "Error [EPAR0370000]: \"\n --> test:1:1\n |\n 1 | \"Hello world!\n | ^" - - "Error [EPAR0370000]: \"\n --> test:1:1\n |\n 1 | \"\\\"\n | ^" - - "Error [EPAR0370000]: \"\n --> test:1:1\n |\n 1 | \"\\l\"\n | ^" - - "Error [EPAR0370000]: \"\n --> test:1:1\n |\n 1 | \"\\uaaa\"\n | ^" - - "Error [EPAR0370000]: \"\n --> test:1:1\n |\n 1 | \"\\u\"\n | ^" - - "Error [EPAR0370000]: \"\n --> test:1:1\n |\n 1 | \"\\xFF\"\n | ^" - - "Error [EPAR0370000]: \"\n --> test:1:1\n |\n 1 | \"\\x\"\n | ^" + - "Error [EPAR0370027]: Expected a closed string but found \"Hello world!." + - "Error [EPAR0370027]: Expected a closed string but found \"\\\"." + - "Error [EPAR0370026]: Expected a valid escape character but found \\l." + - "Error [EPAR0370027]: Expected a closed string but found \"\\uaaa\"." + - "Error [EPAR0370027]: Expected a closed string but found \"\\u\"." + - "Error [EPAR0370026]: Expected a valid escape character but found \\xFF." + - "Error [EPAR0370027]: Expected a closed string but found \"\\x\"."