Merge branch 'feature/string-parsing' of github.com:AleoHQ/leo into feature/string-canonicalization

This commit is contained in:
gluax 2021-05-20 12:40:08 -04:00
commit 947f13b77b
13 changed files with 219 additions and 147 deletions

View File

@ -48,6 +48,46 @@ impl SyntaxError {
SyntaxError::Error(FormattedError::new_from_span(message, span))
}
/// Creates the "Invalid character" syntax error located at `span`.
///
/// `character` holds the raw bytes of the rejected literal; they are rendered
/// in the message with their `Debug` formatting (a list of byte values).
pub fn invalid_char(character: Vec<u8>, span: &Span) -> Self {
    let message = format!("Invalid character '{:?}'", character);
    Self::new_from_span(message, span)
}
/// Creates the syntax error reported for an empty character literal (`''`)
/// located at `span`.
pub fn invalid_empty_char(span: &Span) -> Self {
    let message = String::from("Empty character '' is not valid");
    Self::new_from_span(message, span)
}
/// Creates the "Invalid escape character" syntax error located at `span`.
///
/// `character` is the symbol that followed the backslash; the message shows
/// it prefixed with a literal `\`.
pub fn invalid_escaped_char(character: char, span: &Span) -> Self {
    let message = format!("Invalid escape character '\\{}'", character);
    Self::new_from_span(message, span)
}
/// Creates the syntax error for a rejected `\x` hex escape located at `span`.
///
/// `character` holds the raw bytes that followed `\x`; they are rendered with
/// their `Debug` formatting (a list of byte values, not the digits themselves).
///
/// NOTE(review): the message text says "singe" (sic) and is kept verbatim
/// because the test expectations compare the exact wording.
pub fn invalid_hex_char(character: Vec<u8>, span: &Span) -> Self {
    let message = format!("Invalid singe hex character '\\x{:?}'", character);
    Self::new_from_span(message, span)
}
/// Creates the syntax error for a `\x` escape that has only one hex digit,
/// located at `span`.
///
/// The message echoes the offending digit and suggests the zero-padded form
/// (`\x0` followed by the same digit).
///
/// NOTE(review): the message text says "singe" (sic) and its final quote is
/// never closed; both are kept verbatim because the test expectations compare
/// the exact wording.
pub fn invalid_hex_single_char(character: char, span: &Span) -> Self {
    let message = format!(
        "Invalid singe hex character '\\x{}', expected '\\x0{}",
        character, character
    );
    Self::new_from_span(message, span)
}
/// Creates the syntax error for a rejected unicode character located at `span`.
///
/// `escaped` selects the wording: `true` produces the "unicode escaped
/// character" message, `false` the "unicode symbol character" message. In both
/// cases `character` (the raw bytes) is rendered with `Debug` formatting
/// inside `\u{...}`.
pub fn invalid_unicode_char(character: Vec<u8>, escaped: bool, span: &Span) -> Self {
    let message = if escaped {
        format!("Invalid unicode escaped character '\\u{{{:?}}}'", character)
    } else {
        format!("Invalid unicode symbol character '\\u{{{:?}}}'", character)
    };
    Self::new_from_span(message, span)
}
/// Creates the "Cannot import empty list" syntax error located at `span`.
pub fn invalid_import_list(span: &Span) -> Self {
    let message = String::from("Cannot import empty list");
    Self::new_from_span(message, span)
}

View File

@ -14,7 +14,7 @@
// You should have received a copy of the GNU General Public License
// along with the Leo library. If not, see <https://www.gnu.org/licenses/>.
use tendril::format_tendril;
use tendril::{format_tendril, StrTendril};
use super::*;
@ -643,6 +643,120 @@ impl ParserContext {
}
}
///
/// Parses the raw text of a character literal (the content between the single
/// quotes, as handed over in `Token::CharLit`) into a `ValueExpression::Char`.
///
/// Accepted forms:
/// - a single symbol written directly, e.g. `a`, `é` or `❤`;
/// - a simple escape: `\0`, `\t`, `\n`, `\r`, `\"`, `\'`, `\\`;
/// - a two-digit hex escape `\xNN` whose value is at most `0x7F`;
/// - a unicode escape `\u{...}` with one to six hex digits naming a valid
///   unicode scalar value.
///
/// # Errors
///
/// Returns a `SyntaxError` when the literal is empty, uses an unknown or
/// dangling escape, has a malformed or out-of-range hex/unicode escape, or
/// contains more than one character.
///
fn parse_char(&mut self, input_tendril: StrTendril, span: Span) -> SyntaxResult<Expression> {
    if input_tendril.is_empty() {
        return Err(SyntaxError::invalid_empty_char(&span));
    }

    // Set when the previous byte was a backslash that still awaits its escape code.
    let mut escaped = false;
    // Set once a `\x` escape has been seen.
    let mut hex = false;
    // Set once a `\u` escape has been seen.
    let mut unicode = false;
    // Payload of the literal: raw symbol bytes, the decoded simple escape, or
    // the digits of a hex/unicode escape.
    let mut characters: Vec<u8> = vec![];

    for &byte in input_tendril[..].as_bytes() {
        if escaped {
            escaped = false;
            // Drop the backslash that was buffered on the previous iteration.
            characters.clear();
            match byte {
                b'0' => characters.push(0),
                b't' => characters.push(9),
                b'n' => characters.push(10),
                b'r' => characters.push(13),
                b'\"' => characters.push(34),
                b'\'' => characters.push(39),
                b'\\' => characters.push(92),
                b'x' => hex = true,
                b'u' => unicode = true,
                _ => return Err(SyntaxError::invalid_escaped_char(byte as char, &span)),
            }
            continue;
        }

        match byte {
            // Braces only delimit the digits of `\u{...}`; they are never part
            // of the payload. NOTE(review): as a consequence a literal that is
            // just `{` or `}` ends up with an empty payload and is rejected.
            b'{' => characters.clear(),
            b'}' => {}
            _ => {
                if byte == b'\\' {
                    escaped = true;
                }
                characters.push(byte);
            }
        }
    }

    match characters.len() {
        1..=6 if unicode => {
            // `char::from_u32` already rejects surrogates and anything above 0x10FFFF.
            let scalar = std::str::from_utf8(&characters)
                .ok()
                .and_then(|digits| u32::from_str_radix(digits, 16).ok())
                .and_then(std::char::from_u32);
            match scalar {
                Some(unicode_char) => Ok(Expression::Value(ValueExpression::Char(unicode_char, span))),
                None => Err(SyntaxError::invalid_unicode_char(characters, true, &span)),
            }
        }
        1 => {
            if hex {
                // `\x` followed by a single digit.
                Err(SyntaxError::invalid_hex_single_char(characters[0] as char, &span))
            } else if escaped {
                // The literal ended on a backslash with nothing to escape.
                Err(SyntaxError::invalid_escaped_char(characters[0] as char, &span))
            } else {
                Ok(Expression::Value(ValueExpression::Char(characters[0] as char, span)))
            }
        }
        2 if hex => {
            // Hex escapes are limited to the ASCII range.
            let ascii = std::str::from_utf8(&characters)
                .ok()
                .and_then(|digits| u8::from_str_radix(digits, 16).ok())
                .filter(|number| *number <= 127);
            match ascii {
                Some(number) => Ok(Expression::Value(ValueExpression::Char(number as char, span))),
                None => Err(SyntaxError::invalid_hex_char(characters, &span)),
            }
        }
        // A hex escape must have exactly two digits; without this arm a longer
        // one would be mistaken for a directly written symbol below.
        _ if hex => Err(SyntaxError::invalid_hex_char(characters, &span)),
        2 | 3 | 4 => {
            // Directly written multi-byte symbol: the payload must decode to
            // exactly one character, otherwise the literal holds several.
            let decoded = std::str::from_utf8(&characters).map(|text| {
                let mut symbols = text.chars();
                match (symbols.next(), symbols.next()) {
                    (Some(symbol), None) => Some(symbol),
                    _ => None,
                }
            });
            match decoded {
                Ok(Some(character)) => Ok(Expression::Value(ValueExpression::Char(character, span))),
                Ok(None) => Err(SyntaxError::invalid_char(characters, &span)),
                Err(_) => Err(SyntaxError::invalid_unicode_char(characters, false, &span)),
            }
        }
        _ => Err(SyntaxError::invalid_char(characters, &span)),
    }
}
///
/// Returns an [`Expression`] AST node if the next token is a primary expression:
/// - Literals: field, group, unsigned integer, signed integer, boolean, address
@ -689,7 +803,7 @@ impl ParserContext {
Token::True => Expression::Value(ValueExpression::Boolean("true".into(), span)),
Token::False => Expression::Value(ValueExpression::Boolean("false".into(), span)),
Token::AddressLit(value) => Expression::Value(ValueExpression::Address(value, span)),
Token::CharLit(value) => Expression::Value(ValueExpression::Char(value, span)),
Token::CharLit(value) => self.parse_char(value, span)?,
Token::StringLiteral(value) => Expression::Value(ValueExpression::String(value, span)),
Token::LeftParen => self.parse_tuple_expression(&span)?,
Token::LeftSquare => self.parse_array_expression(&span)?,

View File

@ -164,132 +164,6 @@ impl Token {
(0, None)
}
///
/// Tries to lex a character literal from the front of `input_tendril`.
///
/// On success returns `(consumed, Some(Token::CharLit(c)))`, where `consumed`
/// is the index just past the closing single quote. Returns `(0, None)` when
/// the input is empty, no closing quote is found, an unknown escape follows a
/// backslash, or the collected content is not a valid character.
///
fn eat_char(input_tendril: &StrTendril) -> (usize, Option<Token>) {
// Probably better to move this logic to a parse_char.
// Would give better errors, and isolates logic from lexer.
// Lexer can just return content between single quotes.
if input_tendril.is_empty() {
return (0, None);
}
let input = input_tendril[..].as_bytes();
// Scanning starts at index 1; presumably byte 0 is the opening quote the
// caller dispatched on — confirm against the call site.
let mut i = 1;
// True while the previous byte was a backslash awaiting its escape code.
let mut escaped = false;
// Set once a `\x` escape has been seen.
let mut hex = false;
// Set once a `\u` escape has been seen.
let mut unicode = false;
// Set when the closing single quote was reached.
let mut last = false;
// Payload of the literal: raw bytes, a decoded simple escape, or escape digits.
let mut characters: Vec<u8> = vec![];
while i < input.len() {
if !escaped {
// An unescaped quote terminates the literal; `i` ends up just past it.
if input[i] == b'\'' {
last = true;
i += 1;
break;
}
// Braces are skipped and never stored; an opening brace also resets
// the payload collected so far.
if input[i] == b'{' {
i += 1;
characters.clear();
continue;
}
if input[i] == b'}' {
i += 1;
continue;
}
} else {
escaped = false;
// Drop the backslash that was buffered on the previous iteration.
characters.clear();
match input[i] {
b'0' => characters.push(0),
b't' => characters.push(9),
b'n' => characters.push(10),
b'r' => characters.push(13),
b'\"' => characters.push(34),
b'\'' => characters.push(39),
b'\\' => characters.push(92),
b'x' => {
hex = true;
i += 1;
continue;
}
b'u' => {
unicode = true;
}
_ => {
// Unknown escape code.
return (0, None);
}
}
i += 1;
continue;
}
if input[i] == b'\\' {
escaped = true;
}
characters.push(input[i]);
i += 1;
}
// The loop ran off the end of the input without seeing a closing quote.
if !last {
return (0, None);
}
return match characters.len() {
// Unicode escape: the payload is read as hex digits of a code point.
// NOTE(review): only 1 to 5 digits are matched here, so a six-digit escape
// falls through to the catch-all arm and is rejected.
1 | 2 | 3 | 4 | 5 if unicode => {
if let Ok(string) = std::str::from_utf8(&characters[..]) {
if let Ok(hex) = u32::from_str_radix(&string, 16) {
if hex <= 0x10FFFF {
if let Some(unicode_char) = std::char::from_u32(hex) {
return (i, Some(Token::CharLit(unicode_char)));
}
}
}
}
(0, None)
}
1 => {
// A `\x` escape with a single digit is rejected.
if hex {
return (0, None);
}
(i, Some(Token::CharLit(characters[0] as char)))
}
// Two-digit hex escape, accepted only up to 127.
2 if hex => {
if let Ok(string) = std::str::from_utf8(&characters[..]) {
if let Ok(number) = u8::from_str_radix(&string, 16) {
if number <= 127 {
return (i, Some(Token::CharLit(number as char)));
}
}
}
(0, None)
}
3 | 4 => {
// direct unicode symbol
// Only the first character of the decoded payload is used.
if let Ok(string) = std::str::from_utf8(&characters[..]) {
if let Some(character) = string.chars().next() {
return (i, Some(Token::CharLit(character)));
}
}
(0, None)
}
_ => (0, None),
};
}
///
/// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`].
/// An integer can be eaten if its bytes are at the front of the given `input_tendril` string.
@ -383,7 +257,23 @@ impl Token {
return (i + 1, Some(Token::FormatString(segments)));
}
b'\'' => {
return Self::eat_char(&input_tendril);
let mut i = 1;
let mut end = false;
while i < input.len() {
if input[i] == b'\'' {
end = true;
break;
}
i += 1;
}
if !end {
return (0, None);
}
return (i + 1, Some(Token::CharLit(input_tendril.subtendril(1, (i - 1) as u32))));
}
x if x.is_ascii_digit() => {
return Self::eat_integer(&input_tendril);

View File

@ -48,7 +48,7 @@ pub enum Token {
True,
False,
AddressLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril),
CharLit(char),
CharLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril),
At,

View File

@ -2,4 +2,4 @@
namespace: Compile
expectation: Fail
outputs:
- " --> compiler-test:4:23\n |\n 4 | const not_valid = '';\n | ^\n |\n = unexpected token: '''"
- " --> compiler-test:4:23\n |\n 4 | const not_valid = '';\n | ^^\n |\n = Empty character '' is not valid"

View File

@ -4,12 +4,14 @@ expectation: Pass
outputs:
- "'a' @ 1:1-4"
- "'Z' @ 1:1-4"
- "'\"' @ 1:1-5"
- "'' @ 1:1-5"
- "'' @ 1:1-5"
- "'\u0000' @ 1:1-5"
- "'❤' @ 1:1-11"
- "'の' @ 1:1-11"
- "'\\\"' @ 1:1-5"
- "'\\t' @ 1:1-5"
- "'\\r' @ 1:1-5"
- "'\\0' @ 1:1-5"
- "'\\u{2764}' @ 1:1-11"
- "'\\u{306E}' @ 1:1-11"
- "'\\u{10FFFF}' @ 1:1-13"
- "'❤' @ 1:1-6"
- "'の' @ 1:1-6"
- "'*' @ 1:1-7"
- "'\\x0F' @ 1:1-7"
- "'\\x2A' @ 1:1-7"

View File

@ -2,7 +2,9 @@
namespace: ParseExpression
expectation: Fail
outputs:
- " --> test:1:1\n |\n 1 | '\\'\n | ^\n |\n = unexpected token: '''"
- " --> test:1:1\n |\n 1 | '\\'\n | ^^^\n |\n = Invalid escape character '\\\\'"
- " --> test:1:1\n |\n 1 | 'a\n | ^\n |\n = unexpected token: '''"
- " --> test:1:1\n |\n 1 | ''\n | ^\n |\n = unexpected token: '''"
- " --> test:1:1\n |\n 1 | '\\x9'\n | ^\n |\n = unexpected token: '''"
- " --> test:1:1\n |\n 1 | ''\n | ^^\n |\n = Empty character '' is not valid"
- " --> test:1:1\n |\n 1 | '\\x9'\n | ^^^^^\n |\n = Invalid singe hex character '\\x9', expected '\\x09"
- " --> test:1:1\n |\n 1 | '\\x80'\n | ^^^^^^\n |\n = Invalid singe hex character '\\x[56, 48]'"
- " --> test:1:1\n |\n 1 | '\\u{9999999}'\n | ^^^^^^^^^^^^^\n |\n = Invalid character '[57, 57, 57, 57, 57, 57, 57]'"

View File

@ -74,6 +74,15 @@ outputs:
col_stop: 11
path: test
content: "'\\u{306E}'"
- Value:
Char:
- 􏿿
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 13
path: test
content: "'\\u{10FFFF}'"
- Value:
Char:
- ❤
@ -92,6 +101,15 @@ outputs:
col_stop: 6
path: test
content: "'の'"
- Value:
Char:
- "\u000f"
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 7
path: test
content: "'\\x0F'"
- Value:
Char:
- "*"

View File

@ -127,8 +127,6 @@ outputs:
- - a
- a
- " "
- "`"
- " "
- "\\"
- " "
- "\""
@ -148,6 +146,6 @@ outputs:
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 31
col_stop: 28
path: test
content: "`aa \\` \\\\ \\\" \\n aa \\t \\r \\0`"
content: "`aa \\\\ \\\" \\n aa \\t \\r \\0`"

View File

@ -11,6 +11,8 @@ expectation: Pass
'\0'
'\u{2764}'
'\u{306E}'
'\u{10FFFF}'
'❤'
'の'
'\x0F'
'\x2A'

View File

@ -9,4 +9,8 @@ expectation: Fail
''
'\x9'
'\x9'
'\x80'
'\u{9999999}'

View File

@ -11,6 +11,8 @@ expectation: Pass
'\0'
'\u{2764}'
'\u{306E}'
'\u{10FFFF}'
'❤'
'の'
'\x0F'
'\x2A'

View File

@ -18,4 +18,4 @@ expectation: Pass
`\x0A`
`aa \` \\ \" \n aa \t \r \0`
`aa \\ \" \n aa \t \r \0`