From 6e71c8ab4f8d3f244f5aaeada6e786f18f0edaf7 Mon Sep 17 00:00:00 2001 From: gluax Date: Thu, 13 May 2021 14:33:01 -0400 Subject: [PATCH] addex hex, octal, and direct unicode --- asg/src/expression/constant.rs | 9 +- ast/src/expression/value.rs | 2 +- compiler/src/phases/reducing_director.rs | 5 +- parser/src/tokenizer/lexer.rs | 189 +++++++++++++++--- parser/src/tokenizer/token.rs | 2 +- tests/parser/expression/literal/char.leo | 18 +- .../parser/expression/literal/char_parse.leo | 10 +- 7 files changed, 190 insertions(+), 45 deletions(-) diff --git a/asg/src/expression/constant.rs b/asg/src/expression/constant.rs index 157812052b..f6fcf333fa 100644 --- a/asg/src/expression/constant.rs +++ b/asg/src/expression/constant.rs @@ -129,14 +129,11 @@ impl<'a> FromAst<'a, leo_ast::ValueExpression> for Constant<'a> { )); } } + Constant { parent: Cell::new(None), span: Some(span.clone()), - value: ConstValue::Char( - value - .parse::() - .map_err(|_| AsgConvertError::invalid_char(&value, span))?, - ), + value: ConstValue::Char(*value), } } Field(value, span) => { @@ -237,7 +234,7 @@ impl<'a> Into for &Constant<'a> { leo_ast::ValueExpression::Boolean(value.to_string().into(), self.span.clone().unwrap_or_default()) } ConstValue::Char(value) => { - leo_ast::ValueExpression::Char(value.to_string().into(), self.span.clone().unwrap_or_default()) + leo_ast::ValueExpression::Char(value.clone(), self.span.clone().unwrap_or_default()) } ConstValue::Field(value) => { leo_ast::ValueExpression::Field(value.to_string().into(), self.span.clone().unwrap_or_default()) diff --git a/ast/src/expression/value.rs b/ast/src/expression/value.rs index 14bd2ca687..865df9235a 100644 --- a/ast/src/expression/value.rs +++ b/ast/src/expression/value.rs @@ -24,7 +24,7 @@ pub enum ValueExpression { // todo: deserialize values here Address(#[serde(with = "crate::common::tendril_json")] StrTendril, Span), Boolean(#[serde(with = "crate::common::tendril_json")] StrTendril, Span), - Char(#[serde(with = "crate::common::tendril_json")] StrTendril, Span), + Char(char, Span), Field(#[serde(with = "crate::common::tendril_json")] StrTendril, Span), Group(Box), Implicit(#[serde(with = "crate::common::tendril_json")] StrTendril, Span), diff --git a/compiler/src/phases/reducing_director.rs b/compiler/src/phases/reducing_director.rs index bfc4fd073b..0bec948eb3 100644 --- a/compiler/src/phases/reducing_director.rs +++ b/compiler/src/phases/reducing_director.rs @@ -434,7 +434,10 @@ impl CombineAstAsgDirector { new = ValueExpression::Boolean(tendril.clone(), span.clone()); } ConstValue::Char(_) => { - new = ValueExpression::Char(tendril.clone(), span.clone()); + if let Some(c) = tendril.chars().next() { + new = ValueExpression::Char(c, span.clone()); + } + // TODO RETURN ERR } _ => unimplemented!(), // impossible? } diff --git a/parser/src/tokenizer/lexer.rs b/parser/src/tokenizer/lexer.rs index f318735164..3e4aa64d51 100644 --- a/parser/src/tokenizer/lexer.rs +++ b/parser/src/tokenizer/lexer.rs @@ -151,53 +151,184 @@ impl Token { return (i + 1, Some(Token::FormatString(segments))); } b'\'' => { - if input[1] == b'\'' { - return (0, None); - } - let mut i = 1; - let mut in_escape = false; - let mut character = String::new(); + let mut escaped = false; + let mut hex = false; + let mut octal = false; + let mut unicode = true; + let mut characters: Vec = vec![]; + while i < input.len() { - if !in_escape { + if !escaped { if input[i] == b'\'' { + i += 1; break; } - if input[i] == b'\\' { - in_escape = !in_escape; - } else { - character.push(input[i] as char); + + if input[i] == b'{' { + i += 1; + characters.clear(); + continue; + } + + if input[i] == b'}' { + i += 1; + continue; } } else { - in_escape = false; - if input[i] == b'u' { - i += 2; - let mut j = i; - let mut size = 0; - while input[j] != b'}' { - j += 1; - size += 1; - } - let hex_string_number: String = input_tendril.subtendril(i as u32, size).to_string(); - if let Ok(hex) = u32::from_str_radix(&hex_string_number, 16) { - if let Some(unicode) = std::char::from_u32(hex) { - i = j; - character = unicode.to_string(); + escaped = false; + characters.clear(); + + match input[i] { + b'0' => characters.push(0), + b't' => characters.push(9), + b'n' => characters.push(10), + b'r' => characters.push(13), + b'\"' => characters.push(34), + b'\'' => characters.push(39), + b'\\' => characters.push(92), + b'x' => { + i += 1; + match input[i] { + b'H' => { + hex = true; + } + b'O' => { + octal = true; + } + _ => { + return (0, None); + } } - } else { + + i += 1; + continue; + } + b'u' => { + unicode = true; + } + _ => { return (0, None); } - } else { - character.push(input[i] as char); } + + i += 1; + continue; } + + if input[i] == b'\\' { + escaped = true; + } + + characters.push(input[i]); i += 1; } + if i == input.len() { return (0, None); } - return (i + 1, Some(Token::CharLit(character.into()))); + return match characters.len() { + 1 => { + if hex { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Ok(number) = u8::from_str_radix(&string, 16) { + if number < 127 { + return (i, Some(Token::CharLit(number as char))); + } + } + } + } + + (i, Some(Token::CharLit(characters[0] as char))) + } + 2 => { + if hex { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Ok(number) = u8::from_str_radix(&string, 16) { + if number < 127 { + return (i, Some(Token::CharLit(number as char))); + } + } + } + } + + if unicode { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Some(character) = string.chars().next() { + return (i, Some(Token::CharLit(character))); + } + } + } + + (0, None) + } + 3 => { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if octal { + if let Ok(number) = u8::from_str_radix(&string, 8) { + if number < 127 { + return (i, Some(Token::CharLit(number as char))); + } + } + } + + if let Some(character) = string.chars().next() { + return (i, Some(Token::CharLit(character))); + } + } + + (0, None) + } + 4 | 5 | 6 => { + if let Ok(unicode_string) = std::str::from_utf8(&characters[..]) { + if let Ok(hex) = u32::from_str_radix(&unicode_string, 16) { + if let Some(unicode_char) = std::char::from_u32(hex) { + return (i, Some(Token::CharLit(unicode_char))); + } + } + } + + (0, None) + } + _ => (0, None), + }; + + // while i < input.len() { + // if !in_escape { + // if input[i] == b'\'' { + // break; + // } + // if input[i] == b'\\' { + // in_escape = !in_escape; + // } else { + // character.push(input[i] as char); + // } + // } else { + // in_escape = false; + // if input[i] == b'u' { + // i += 2; + // let mut j = i; + // let mut size = 0; + // while input[j] != b'}' { + // j += 1; + // size += 1; + // } + // let hex_string_number: String = input_tendril.subtendril(i as u32, size).to_string(); + // if let Ok(hex) = u32::from_str_radix(&hex_string_number, 16) { + // if let Some(unicode) = std::char::from_u32(hex) { + // i = j; + // character = unicode.to_string(); + // } + // } else { + // return (0, None); + // } + // } else { + // character.push(input[i] as char); + // } + // } + // i += 1; + // } } x if x.is_ascii_digit() => { return Self::eat_integer(&input_tendril); diff --git a/parser/src/tokenizer/token.rs b/parser/src/tokenizer/token.rs index 5863041ae9..d7da426589 100644 --- a/parser/src/tokenizer/token.rs +++ b/parser/src/tokenizer/token.rs @@ -47,7 +47,7 @@ pub enum Token { True, False, AddressLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril), - CharLit(#[serde(with = "leo_ast::common::tendril_json")] StrTendril), + CharLit(char), At, diff --git a/tests/parser/expression/literal/char.leo b/tests/parser/expression/literal/char.leo index d2719e893a..013de7f106 100644 --- a/tests/parser/expression/literal/char.leo +++ b/tests/parser/expression/literal/char.leo @@ -4,8 +4,16 @@ expectation: Pass */ 'a' -'A' -'\'' -'\\' -'\n' -'\u{2764}' \ No newline at end of file +'Z' +'\"' +'\t' +'\r' +'\0' +'\u{2764}' +'\u{306E}' +'❤' +'の' +'\xH2A' +'\xH9' +'\xO011' +'\xO172' \ No newline at end of file diff --git a/tests/parser/expression/literal/char_parse.leo b/tests/parser/expression/literal/char_parse.leo index e784384f51..a0388a3a48 100644 --- a/tests/parser/expression/literal/char_parse.leo +++ b/tests/parser/expression/literal/char_parse.leo @@ -4,9 +4,15 @@ expectation: Pass */ 'a' -'b' +'Z' '\"' '\t' '\r' '\0' -'\u{2764}' \ No newline at end of file +'\u{2764}' +'\u{306E}' +'❤' +'\xH2A' +'\xH9' +'\xO011' +'\xO172' \ No newline at end of file