diff --git a/input/src/expressions/string_expression.rs b/input/src/expressions/string_expression.rs new file mode 100644 index 0000000000..0b479abce7 --- /dev/null +++ b/input/src/expressions/string_expression.rs @@ -0,0 +1,39 @@ +// Copyright (C) 2019-2021 Aleo Systems Inc. +// This file is part of the Leo library. + +// The Leo library is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// The Leo library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with the Leo library. If not, see . + +use crate::{ast::Rule, values::CharValue}; + +use pest::Span; +use pest_ast::FromPest; +use std::fmt; + +#[derive(Clone, Debug, FromPest, PartialEq, Eq)] +#[pest_ast(rule(Rule::expression_string))] +pub struct StringExpression<'ast> { + pub chars: Vec>, + #[pest_ast(outer())] + pub span: Span<'ast>, +} + +impl<'ast> fmt::Display for StringExpression<'ast> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for character in self.chars.iter() { + write!(f, "{:?}", character)?; + } + + Ok(()) + } +} diff --git a/parser/src/errors/syntax.rs b/parser/src/errors/syntax.rs index b1062527e9..98fbc6bad8 100644 --- a/parser/src/errors/syntax.rs +++ b/parser/src/errors/syntax.rs @@ -48,46 +48,6 @@ impl SyntaxError { SyntaxError::Error(FormattedError::new_from_span(message, span)) } - pub fn invalid_char(character: Vec, span: &Span) -> Self { - Self::new_from_span(format!("Invalid character '{:?}'", character), span) - } - - pub fn invalid_empty_char(span: &Span) -> Self { - Self::new_from_span("Empty character '' is not 
valid".to_string(), span) - } - - pub fn invalid_escaped_char(character: char, span: &Span) -> Self { - Self::new_from_span(format!("Invalid escape character '\\{}'", character), span) - } - - pub fn invalid_hex_char(character: Vec, span: &Span) -> Self { - Self::new_from_span(format!("Invalid singe hex character '\\x{:?}'", character), span) - } - - pub fn invalid_hex_single_char(character: char, span: &Span) -> Self { - Self::new_from_span( - format!( - "Invalid singe hex character '\\x{}', expected '\\x0{}", - character, character - ), - span, - ) - } - - pub fn invalid_unicode_char(character: Vec, escaped: bool, span: &Span) -> Self { - if escaped { - return Self::new_from_span( - format!("Invalid unicode escaped character '\\u{{{:?}}}'", character), - span, - ); - } - - Self::new_from_span( - format!("Invalid unicode symbol character '\\u{{{:?}}}'", character), - span, - ) - } - pub fn invalid_import_list(span: &Span) -> Self { Self::new_from_span("Cannot import empty list".to_string(), span) } diff --git a/parser/src/parser/expression.rs b/parser/src/parser/expression.rs index f128349a7a..dc0e8aca01 100644 --- a/parser/src/parser/expression.rs +++ b/parser/src/parser/expression.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License // along with the Leo library. If not, see . -use tendril::{format_tendril, StrTendril}; +use tendril::format_tendril; use super::*; @@ -643,120 +643,6 @@ impl ParserContext { } } - /// - /// Returns a character if it is a valid character that can be parsed. 
- /// - fn parse_char(&mut self, input_tendril: StrTendril, span: Span) -> SyntaxResult { - if input_tendril.is_empty() { - return Err(SyntaxError::invalid_empty_char(&span)); - } - - let input = input_tendril[..].as_bytes(); - let mut i = 0; - let mut escaped = false; - let mut hex = false; - let mut unicode = false; - let mut characters: Vec = vec![]; - - while i < input.len() { - if !escaped { - if input[i] == b'{' { - i += 1; - characters.clear(); - continue; - } - - if input[i] == b'}' { - i += 1; - continue; - } - } else { - escaped = false; - characters.clear(); - - match input[i] { - b'0' => characters.push(0), - b't' => characters.push(9), - b'n' => characters.push(10), - b'r' => characters.push(13), - b'\"' => characters.push(34), - b'\'' => characters.push(39), - b'\\' => characters.push(92), - b'x' => { - hex = true; - - i += 1; - continue; - } - b'u' => { - unicode = true; - } - _ => { - return Err(SyntaxError::invalid_escaped_char(input[i] as char, &span)); - } - } - - i += 1; - - continue; - } - - if input[i] == b'\\' { - escaped = true; - } - - characters.push(input[i]); - i += 1; - } - - return match characters.len() { - 1 | 2 | 3 | 4 | 5 | 6 if unicode => { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Ok(hex) = u32::from_str_radix(&string, 16) { - if hex <= 0x10FFFF { - if let Some(unicode_char) = std::char::from_u32(hex) { - return Ok(Expression::Value(ValueExpression::Char(unicode_char, span))); - } - } - } - } - - Err(SyntaxError::invalid_unicode_char(characters, true, &span)) - } - 1 => { - if hex { - return Err(SyntaxError::invalid_hex_single_char(characters[0] as char, &span)); - } else if escaped { - return Err(SyntaxError::invalid_escaped_char(characters[0] as char, &span)); - } - - Ok(Expression::Value(ValueExpression::Char(characters[0] as char, span))) - } - 2 if hex => { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Ok(number) = u8::from_str_radix(&string, 16) { - if number <= 127 { - 
return Ok(Expression::Value(ValueExpression::Char(number as char, span))); - } - } - } - - Err(SyntaxError::invalid_hex_char(characters, &span)) - } - 3 | 4 => { - // direct unicode symbol - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Some(character) = string.chars().next() { - return Ok(Expression::Value(ValueExpression::Char(character, span))); - } - } - - Err(SyntaxError::invalid_unicode_char(characters, false, &span)) - } - _ => Err(SyntaxError::invalid_char(characters, &span)), - }; - } - /// /// Returns an [`Expression`] AST node if the next token is a primary expression: /// - Literals: field, group, unsigned integer, signed integer, boolean, address @@ -803,7 +689,7 @@ impl ParserContext { Token::True => Expression::Value(ValueExpression::Boolean("true".into(), span)), Token::False => Expression::Value(ValueExpression::Boolean("false".into(), span)), Token::AddressLit(value) => Expression::Value(ValueExpression::Address(value, span)), - Token::CharLit(value) => self.parse_char(value, span)?, + Token::CharLit(value) => Expression::Value(ValueExpression::Char(value, span)), Token::StringLiteral(value) => Expression::Value(ValueExpression::String(value, span)), Token::LeftParen => self.parse_tuple_expression(&span)?, Token::LeftSquare => self.parse_array_expression(&span)?, diff --git a/parser/src/tokenizer/lexer.rs b/parser/src/tokenizer/lexer.rs index b8569c6673..b61c09ce51 100644 --- a/parser/src/tokenizer/lexer.rs +++ b/parser/src/tokenizer/lexer.rs @@ -62,106 +62,66 @@ fn eat_identifier(input_tendril: &StrTendril) -> Option { impl Token { /// - /// Eats String. Returns Token::StringLiteral with processed contents of the string. + /// Returns a new `Token::CharLit` if a character can be eaten, otherwise returns [`None`].
/// - fn eat_string(input_tendril: &StrTendril) -> (usize, Option) { + fn eat_char(input_tendril: StrTendril, escaped: bool, hex: bool, unicode: bool) -> Option { if input_tendril.is_empty() { - return (0, None); + return None; } - let input = input_tendril[..].as_bytes(); - let mut collect: Vec = Vec::new(); - let mut iter = input.iter().enumerate().skip(1); + if escaped { + let string = input_tendril.to_string(); + let escaped = &string[1..string.len()]; - while let Some((i, symbol)) = iter.next() { - let symbol = *symbol; - - if symbol == b'`' { - return (i + 1, Some(Token::StringLiteral(collect))); + if escaped.len() != 1 { + return None; } - // Process escapes. - if symbol == b'\\' { - if let Some((_, escaped)) = iter.next() { - match escaped { - b'0' => collect.push(0 as char), - b't' => collect.push(9 as char), - b'n' => collect.push(10 as char), - b'r' => collect.push(13 as char), - b'\"' => collect.push(34 as char), - b'\'' => collect.push(39 as char), - b'\\' => collect.push(92 as char), - // \x0F - 2 HEX digits after \x - b'x' => { - // get first symbol - if let Some((_, first_hex)) = iter.next() { - // get second symbol - if let Some((_, second_hex)) = iter.next() { - if let Ok(string) = std::str::from_utf8(&[*first_hex, *second_hex]) { - if let Ok(number) = u8::from_str_radix(&string, 16) { - if number <= 127 { - collect.push(number as char); - continue; - } - } - } - } - } + if let Some(character) = escaped.chars().next() { + return match character { + '0' => Some(Token::CharLit(0 as char)), + 't' => Some(Token::CharLit(9 as char)), + 'n' => Some(Token::CharLit(10 as char)), + 'r' => Some(Token::CharLit(13 as char)), + '\"' => Some(Token::CharLit(34 as char)), + '\'' => Some(Token::CharLit(39 as char)), + '\\' => Some(Token::CharLit(92 as char)), + _ => None, + }; + } else { + return None; + } + } - return (0, None); - } + if hex { + let string = input_tendril.to_string(); + let hex_string = &string[2..string.len()]; - // \u{1-6 hex digits} - b'u' => 
{ - if let Some((start, open_brace)) = iter.next() { - if *open_brace == b'{' { - let mut characters: Vec = Vec::new(); + if hex_string.len() != 2 { + return None; + } - while let Some((end, symbol)) = iter.next() { - if end > start + 7 { - return (0, None); - } + if let Ok(ascii_number) = u8::from_str_radix(&hex_string, 16) { + return Some(Token::CharLit(ascii_number as char)); + } + } - match *symbol { - 0..=9 | b'a'..=b'f' | b'A'..=b'F' => characters.push(*symbol), - b'}' => { - if let Ok(unicode_string) = std::str::from_utf8(&characters[..]) { - if let Ok(hex) = u32::from_str_radix(&unicode_string, 16) { - if let Some(unicode_char) = std::char::from_u32(hex) { - collect.push(unicode_char); - break; - } - } - } + if unicode { + let string = input_tendril.to_string(); + let unicode_number = &string[3..string.len() - 1]; - return (0, None); - } - _ => { - return (0, None); - } - } - } - - continue; - } - } - - return (0, None); - } - _ => { - return (0, None); - } - } - continue; + if let Ok(hex) = u32::from_str_radix(&unicode_number, 16) { + if let Some(character) = std::char::from_u32(hex) { + return Some(Token::CharLit(character)); } - - return (0, None); } - - collect.push(symbol as char); } - (0, None) + if let Some(character) = input_tendril.to_string().chars().next() { + return Some(Token::CharLit(character)); + } + + None } /// @@ -208,9 +168,6 @@ impl Token { let input = input_tendril[..].as_bytes(); match input[0] { x if x.is_ascii_whitespace() => return (1, None), - b'`' => { - return Self::eat_string(&input_tendril); - } b'"' => { let mut i = 1; let mut in_escape = false; @@ -258,12 +215,30 @@ impl Token { } b'\'' => { let mut i = 1; + let mut in_escape = false; + let mut escaped = false; + let mut hex = false; + let mut unicode = false; let mut end = false; while i < input.len() { - if input[i] == b'\'' { - end = true; - break; + if !in_escape { + if input[i] == b'\'' { + end = true; + break; + } else if input[i] == b'\\' { + in_escape = true; + } + } 
else { + if input[i] == b'x' { + hex = true; + } else if input[i] == b'u' { + unicode = true; + } else { + escaped = true; + } + + in_escape = false; } i += 1; @@ -273,7 +248,13 @@ impl Token { return (0, None); } - return (i + 1, Some(Token::CharLit(input_tendril.subtendril(1, (i - 1) as u32)))); + let result = Self::eat_char(input_tendril.subtendril(1, (i - 1) as u32), escaped, hex, unicode); + + if result.is_none() { + return (0, None); + } + + return (i + 1, result); } x if x.is_ascii_digit() => { return Self::eat_integer(&input_tendril); diff --git a/parser/src/tokenizer/token.rs b/parser/src/tokenizer/token.rs index b817e15723..1cab255843 100644 --- a/parser/src/tokenizer/token.rs +++ b/parser/src/tokenizer/token.rs @@ -48,7 +48,7 @@ pub enum Token { True, False, AddressLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril), - CharLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril), + CharLit(char), At, diff --git a/tests/expectations/compiler/compiler/char/invalid_char.leo.out b/tests/expectations/compiler/compiler/char/invalid_char.leo.out index 8613ddc150..c529e663ae 100644 --- a/tests/expectations/compiler/compiler/char/invalid_char.leo.out +++ b/tests/expectations/compiler/compiler/char/invalid_char.leo.out @@ -2,4 +2,4 @@ namespace: Compile expectation: Fail outputs: - - " --> compiler-test:4:23\n |\n 4 | const not_valid = '';\n | ^^\n |\n = Empty character '' is not valid" + - " --> compiler-test:4:23\n |\n 4 | const not_valid = '';\n | ^\n |\n = unexpected token: '''" diff --git a/tests/expectations/parser/parser/expression/literal/char.leo.out b/tests/expectations/parser/parser/expression/literal/char.leo.out index 0ce29e5695..d6e0fd1fc1 100644 --- a/tests/expectations/parser/parser/expression/literal/char.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char.leo.out @@ -4,14 +4,12 @@ expectation: Pass outputs: - "'a' @ 1:1-4" - "'Z' @ 1:1-4" - - "'\\\"' @ 1:1-5" - - "'\\t' @ 1:1-5" - - "'\\r' @ 1:1-5" - - 
"'\\0' @ 1:1-5" - - "'\\u{2764}' @ 1:1-11" - - "'\\u{306E}' @ 1:1-11" - - "'\\u{10FFFF}' @ 1:1-13" + - "'\"' @ 1:1-5" + - "'' @ 1:1-5" + - "'' @ 1:1-5" + - "'\u0000' @ 1:1-5" + - "'❤' @ 1:1-11" + - "'の' @ 1:1-11" - "'❤' @ 1:1-6" - "'の' @ 1:1-6" - - "'\\x0F' @ 1:1-7" - - "'\\x2A' @ 1:1-7" + - "'*' @ 1:1-7" diff --git a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out index 4c71c693af..fb3cc4bdcc 100644 --- a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out @@ -2,9 +2,7 @@ namespace: ParseExpression expectation: Fail outputs: - - " --> test:1:1\n |\n 1 | '\\'\n | ^^^\n |\n = Invalid escape character '\\\\'" + - " --> test:1:1\n |\n 1 | '\\'\n | ^\n |\n = unexpected token: '''" - " --> test:1:1\n |\n 1 | 'a\n | ^\n |\n = unexpected token: '''" - - " --> test:1:1\n |\n 1 | ''\n | ^^\n |\n = Empty character '' is not valid" - - " --> test:1:1\n |\n 1 | '\\x9'\n | ^^^^^\n |\n = Invalid singe hex character '\\x9', expected '\\x09" - - " --> test:1:1\n |\n 1 | '\\x80'\n | ^^^^^^\n |\n = Invalid singe hex character '\\x[56, 48]'" - - " --> test:1:1\n |\n 1 | '\\u{9999999}'\n | ^^^^^^^^^^^^^\n |\n = Invalid character '[57, 57, 57, 57, 57, 57, 57]'" + - " --> test:1:1\n |\n 1 | ''\n | ^\n |\n = unexpected token: '''" + - " --> test:1:1\n |\n 1 | '\\x9'\n | ^\n |\n = unexpected token: '''" diff --git a/tests/expectations/parser/parser/expression/literal/char_parse.leo.out b/tests/expectations/parser/parser/expression/literal/char_parse.leo.out index 5ed19fa02e..3b080cd0a6 100644 --- a/tests/expectations/parser/parser/expression/literal/char_parse.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_parse.leo.out @@ -74,15 +74,6 @@ outputs: col_stop: 11 path: test content: "'\\u{306E}'" - - Value: - Char: - - 􏿿 - - line_start: 1 - line_stop: 1 - col_start: 1 - col_stop: 13 
- path: test - content: "'\\u{10FFFF}'" - Value: Char: - ❤ @@ -101,15 +92,6 @@ outputs: col_stop: 6 path: test content: "'の'" - - Value: - Char: - - "\u000f" - - line_start: 1 - line_stop: 1 - col_start: 1 - col_stop: 7 - path: test - content: "'\\x0F'" - Value: Char: - "*" diff --git a/tests/parser/expression/literal/char.leo b/tests/parser/expression/literal/char.leo index 71babf34f3..5ea47f7dbf 100644 --- a/tests/parser/expression/literal/char.leo +++ b/tests/parser/expression/literal/char.leo @@ -11,8 +11,6 @@ expectation: Pass '\0' '\u{2764}' '\u{306E}' -'\u{10FFFF}' '❤' 'の' -'\x0F' '\x2A' \ No newline at end of file diff --git a/tests/parser/expression/literal/char_fail.leo b/tests/parser/expression/literal/char_fail.leo index 5991d04148..565c6f3922 100644 --- a/tests/parser/expression/literal/char_fail.leo +++ b/tests/parser/expression/literal/char_fail.leo @@ -9,8 +9,4 @@ expectation: Fail '' -'\x9' - -'\x80' - -'\u{9999999}' \ No newline at end of file +'\x9' \ No newline at end of file diff --git a/tests/parser/expression/literal/char_parse.leo b/tests/parser/expression/literal/char_parse.leo index 3c22c813ce..515f6b10f3 100644 --- a/tests/parser/expression/literal/char_parse.leo +++ b/tests/parser/expression/literal/char_parse.leo @@ -11,8 +11,6 @@ expectation: Pass '\0' '\u{2764}' '\u{306E}' -'\u{10FFFF}' '❤' 'の' -'\x0F' '\x2A' \ No newline at end of file