add string type to tokenizer + lexer

This commit is contained in:
collin 2022-05-26 15:55:37 -04:00
parent d43605538d
commit 84c0a82008
6 changed files with 126 additions and 126 deletions

View File

@ -375,7 +375,7 @@ impl ParserContext<'_> {
Token::True => Expression::Value(ValueExpression::Boolean("true".into(), span)), Token::True => Expression::Value(ValueExpression::Boolean("true".into(), span)),
Token::False => Expression::Value(ValueExpression::Boolean("false".into(), span)), Token::False => Expression::Value(ValueExpression::Boolean("false".into(), span)),
Token::AddressLit(value) => Expression::Value(ValueExpression::Address(value, span)), Token::AddressLit(value) => Expression::Value(ValueExpression::Address(value, span)),
Token::StringLit(value) => Expression::Value(ValueExpression::String(value, span)), Token::StaticString(value) => Expression::Value(ValueExpression::String(value, span)),
Token::Ident(name) => { Token::Ident(name) => {
let ident = Identifier { name, span }; let ident = Identifier { name, span };
Expression::Identifier(ident) Expression::Identifier(ident)

View File

@ -154,7 +154,7 @@ impl ParserContext<'_> {
p.bump(); p.bump();
let SpannedToken { token, span } = p.prev_token.clone(); let SpannedToken { token, span } = p.prev_token.clone();
string = Some(match token { string = Some(match token {
Token::StringLit(chars) => chars, Token::StaticString(chars) => chars,
_ => { _ => {
p.emit_err(ParserError::unexpected_str(token, "formatted string", span)); p.emit_err(ParserError::unexpected_str(token, "formatted string", span));
Vec::new() Vec::new()

View File

@ -23,6 +23,7 @@ pub(super) const TYPE_TOKENS: &[Token] = &[
Token::Field, Token::Field,
Token::Group, Token::Group,
Token::Scalar, Token::Scalar,
Token::String,
Token::I8, Token::I8,
Token::I16, Token::I16,
Token::I32, Token::I32,
@ -64,6 +65,7 @@ impl ParserContext<'_> {
Token::Field => Type::Field, Token::Field => Type::Field,
Token::Group => Type::Group, Token::Group => Type::Group,
Token::Scalar => Type::Scalar, Token::Scalar => Type::Scalar,
Token::String => Type::String,
x => Type::IntegerType(Self::token_to_int_type(x).expect("invalid int type")), x => Type::IntegerType(Self::token_to_int_type(x).expect("invalid int type")),
}, },
span, span,

View File

@ -39,113 +39,114 @@ fn is_bidi_override(c: char) -> bool {
} }
impl Token { impl Token {
// Eats the parts of the unicode character after \u. // todo: remove this unused code or reference https://github.com/Geal/nom/blob/main/examples/string.rs
fn eat_unicode_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> { // // Eats the parts of the unicode character after \u.
let mut unicode = String::new(); // fn eat_unicode_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
// Account for the chars '\' and 'u'. // let mut unicode = String::new();
let mut len = 2; // // Account for the chars '\' and 'u'.
// let mut len = 2;
//
// if input.next_if_eq(&'{').is_some() {
// len += 1;
// } else if let Some(c) = input.next() {
// return Err(ParserError::lexer_unopened_escaped_unicode_char(c).into());
// } else {
// return Err(ParserError::lexer_empty_input_tendril().into());
// }
//
// while let Some(c) = input.next_if(|c| c != &'}') {
// len += 1;
// unicode.push(c);
// }
//
// if input.next_if_eq(&'}').is_some() {
// len += 1;
// } else {
// return Err(ParserError::lexer_unclosed_escaped_unicode_char(unicode).into());
// }
//
// // Max of 6 digits.
// // Minimum of 1 digit.
// if unicode.len() > 6 || unicode.is_empty() {
// return Err(ParserError::lexer_invalid_escaped_unicode_length(unicode).into());
// }
//
// if let Ok(hex) = u32::from_str_radix(&unicode, 16) {
// if let Some(character) = std::char::from_u32(hex) {
// Ok((len, Char::Scalar(character)))
// } else if hex <= 0x10FFFF {
// Ok((len, Char::NonScalar(hex)))
// } else {
// Err(ParserError::lexer_invalid_character_exceeded_max_value(unicode).into())
// }
// } else {
// Err(ParserError::lexer_expected_valid_hex_char(unicode).into())
// }
// }
if input.next_if_eq(&'{').is_some() { // // Eats the parts of the hex character after \x.
len += 1; // fn eat_hex_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
} else if let Some(c) = input.next() { // let mut hex = String::new();
return Err(ParserError::lexer_unopened_escaped_unicode_char(c).into()); // // Account for the chars '\' and 'x'.
} else { // let mut len = 2;
return Err(ParserError::lexer_empty_input_tendril().into()); //
} // // First hex character.
// if let Some(c) = input.next_if(|c| c != &'\'') {
// len += 1;
// hex.push(c);
// } else if let Some(c) = input.next() {
// return Err(ParserError::lexer_expected_valid_hex_char(c).into());
// } else {
// return Err(ParserError::lexer_empty_input_tendril().into());
// }
//
// // Second hex character.
// if let Some(c) = input.next_if(|c| c != &'\'') {
// len += 1;
// hex.push(c);
// } else if let Some(c) = input.next() {
// return Err(ParserError::lexer_expected_valid_hex_char(c).into());
// } else {
// return Err(ParserError::lexer_empty_input_tendril().into());
// }
//
// if let Ok(ascii_number) = u8::from_str_radix(&hex, 16) {
// // According to RFC, we allow only values less than 128.
// if ascii_number > 127 {
// return Err(ParserError::lexer_expected_valid_hex_char(hex).into());
// }
//
// Ok((len, Char::Scalar(ascii_number as char)))
// } else {
// Err(ParserError::lexer_expected_valid_hex_char(hex).into())
// }
// }
while let Some(c) = input.next_if(|c| c != &'}') { // fn eat_escaped_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
len += 1; // match input.next() {
unicode.push(c); // None => Err(ParserError::lexer_empty_input_tendril().into()),
} // // Length of 2 to account the '\'.
// Some('0') => Ok((2, Char::Scalar(0 as char))),
// Some('t') => Ok((2, Char::Scalar(9 as char))),
// Some('n') => Ok((2, Char::Scalar(10 as char))),
// Some('r') => Ok((2, Char::Scalar(13 as char))),
// Some('\"') => Ok((2, Char::Scalar(34 as char))),
// Some('\'') => Ok((2, Char::Scalar(39 as char))),
// Some('\\') => Ok((2, Char::Scalar(92 as char))),
// Some('u') => Self::eat_unicode_char(input),
// Some('x') => Self::eat_hex_char(input),
// Some(c) => Err(ParserError::lexer_expected_valid_escaped_char(c).into()),
// }
// }
if input.next_if_eq(&'}').is_some() { // /// Returns a `char` if a character can be eaten, otherwise returns [`None`].
len += 1; // fn eat_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
} else { // match input.next() {
return Err(ParserError::lexer_unclosed_escaped_unicode_char(unicode).into()); // None => Err(ParserError::lexer_empty_input_tendril().into()),
} // Some('\\') => Self::eat_escaped_char(input),
// Some(c) => Ok((c.len_utf8(), Char::Scalar(c))),
// Max of 6 digits. // }
// Minimum of 1 digit. // }
if unicode.len() > 6 || unicode.is_empty() {
return Err(ParserError::lexer_invalid_escaped_unicode_length(unicode).into());
}
if let Ok(hex) = u32::from_str_radix(&unicode, 16) {
if let Some(character) = std::char::from_u32(hex) {
Ok((len, Char::Scalar(character)))
} else if hex <= 0x10FFFF {
Ok((len, Char::NonScalar(hex)))
} else {
Err(ParserError::lexer_invalid_character_exceeded_max_value(unicode).into())
}
} else {
Err(ParserError::lexer_expected_valid_hex_char(unicode).into())
}
}
// Eats the parts of the hex character after \x.
fn eat_hex_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
let mut hex = String::new();
// Account for the chars '\' and 'x'.
let mut len = 2;
// First hex character.
if let Some(c) = input.next_if(|c| c != &'\'') {
len += 1;
hex.push(c);
} else if let Some(c) = input.next() {
return Err(ParserError::lexer_expected_valid_hex_char(c).into());
} else {
return Err(ParserError::lexer_empty_input_tendril().into());
}
// Second hex character.
if let Some(c) = input.next_if(|c| c != &'\'') {
len += 1;
hex.push(c);
} else if let Some(c) = input.next() {
return Err(ParserError::lexer_expected_valid_hex_char(c).into());
} else {
return Err(ParserError::lexer_empty_input_tendril().into());
}
if let Ok(ascii_number) = u8::from_str_radix(&hex, 16) {
// According to RFC, we allow only values less than 128.
if ascii_number > 127 {
return Err(ParserError::lexer_expected_valid_hex_char(hex).into());
}
Ok((len, Char::Scalar(ascii_number as char)))
} else {
Err(ParserError::lexer_expected_valid_hex_char(hex).into())
}
}
fn eat_escaped_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
match input.next() {
None => Err(ParserError::lexer_empty_input_tendril().into()),
// Length of 2 to account the '\'.
Some('0') => Ok((2, Char::Scalar(0 as char))),
Some('t') => Ok((2, Char::Scalar(9 as char))),
Some('n') => Ok((2, Char::Scalar(10 as char))),
Some('r') => Ok((2, Char::Scalar(13 as char))),
Some('\"') => Ok((2, Char::Scalar(34 as char))),
Some('\'') => Ok((2, Char::Scalar(39 as char))),
Some('\\') => Ok((2, Char::Scalar(92 as char))),
Some('u') => Self::eat_unicode_char(input),
Some('x') => Self::eat_hex_char(input),
Some(c) => Err(ParserError::lexer_expected_valid_escaped_char(c).into()),
}
}
/// Returns a `char` if a character can be eaten, otherwise returns [`None`].
fn eat_char(input: &mut Peekable<impl Iterator<Item = char>>) -> Result<(usize, Char)> {
match input.next() {
None => Err(ParserError::lexer_empty_input_tendril().into()),
Some('\\') => Self::eat_escaped_char(input),
Some(c) => Ok((c.len_utf8(), Char::Scalar(c))),
}
}
/// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`]. /// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`].
/// An integer can be eaten if its bytes are at the front of the given `input_tendril` string. /// An integer can be eaten if its bytes are at the front of the given `input_tendril` string.
@ -183,27 +184,25 @@ impl Token {
return Ok((1, Token::WhiteSpace)); return Ok((1, Token::WhiteSpace));
} }
Some('"') => { Some('"') => {
let mut string: Vec<leo_ast::Char> = Vec::new(); let mut string = String::from("\"");
input.next();
let mut len = 0; let mut ended = false;
while let Some(c) = input.peek() { while let Some(c) = input.next() {
if is_bidi_override(*c) { if is_bidi_override(c) {
return Err(ParserError::lexer_bidi_override().into()); return Err(ParserError::lexer_bidi_override().into());
} }
if c == &'"' { string.push(c);
if c == '"' {
ended = true;
break; break;
} }
let (char_len, character) = Self::eat_char(&mut input)?;
len += char_len;
string.push(character.into());
} }
if input.next_if_eq(&'"').is_some() { if !ended {
return Ok((len + 2, Token::StringLit(string))); return Err(ParserError::lexer_string_not_closed(string).into());
} }
return Err(ParserError::lexer_string_not_closed(leo_ast::Chars(string)).into()); return Ok((string.len(), Token::StaticString(string)));
} }
Some(x) if x.is_ascii_digit() => { Some(x) if x.is_ascii_digit() => {
return Self::eat_integer(&mut input); return Self::eat_integer(&mut input);

View File

@ -50,7 +50,7 @@ pub enum Token {
// Literals // Literals
CommentLine(String), CommentLine(String),
CommentBlock(String), CommentBlock(String),
StringLit(Vec<leo_ast::Char>), StaticString(String),
Ident(Symbol), Ident(Symbol),
Int(String), Int(String),
True, True,
@ -96,6 +96,7 @@ pub enum Token {
Field, Field,
Group, Group,
Scalar, Scalar,
String,
I8, I8,
I16, I16,
I32, I32,
@ -150,6 +151,7 @@ pub const KEYWORD_TOKENS: &[Token] = &[
Token::Public, Token::Public,
Token::Return, Token::Return,
Token::Scalar, Token::Scalar,
Token::String,
Token::True, Token::True,
Token::U8, Token::U8,
Token::U16, Token::U16,
@ -189,6 +191,7 @@ impl Token {
Token::Public => sym::Public, Token::Public => sym::Public,
Token::Return => sym::Return, Token::Return => sym::Return,
Token::Scalar => sym::scalar, Token::Scalar => sym::scalar,
Token::String => sym::string,
Token::True => sym::True, Token::True => sym::True,
Token::U8 => sym::u8, Token::U8 => sym::u8,
Token::U16 => sym::u16, Token::U16 => sym::u16,
@ -206,13 +209,7 @@ impl fmt::Display for Token {
match self { match self {
CommentLine(s) => write!(f, "{}", s), CommentLine(s) => write!(f, "{}", s),
CommentBlock(s) => write!(f, "{}", s), CommentBlock(s) => write!(f, "{}", s),
StringLit(string) => { StaticString(s) => write!(f, "{}", s),
write!(f, "\"")?;
for character in string.iter() {
write!(f, "{}", character)?;
}
write!(f, "\"")
}
Ident(s) => write!(f, "{}", s), Ident(s) => write!(f, "{}", s),
Int(s) => write!(f, "{}", s), Int(s) => write!(f, "{}", s),
True => write!(f, "true"), True => write!(f, "true"),
@ -255,6 +252,7 @@ impl fmt::Display for Token {
Field => write!(f, "field"), Field => write!(f, "field"),
Group => write!(f, "group"), Group => write!(f, "group"),
Scalar => write!(f, "scalar"), Scalar => write!(f, "scalar"),
String => write!(f, "string"),
I8 => write!(f, "i8"), I8 => write!(f, "i8"),
I16 => write!(f, "i16"), I16 => write!(f, "i16"),
I32 => write!(f, "i32"), I32 => write!(f, "i32"),

View File

@ -137,6 +137,7 @@ symbols! {
scalar, scalar,
Star: "*", Star: "*",
std, std,
string,
Struct: "struct", Struct: "struct",
test, test,
True: "true", True: "true",