diff --git a/compiler/tests/char/hex.leo b/compiler/tests/char/hex.leo index 0cd61be0f5..2cbc144783 100644 --- a/compiler/tests/char/hex.leo +++ b/compiler/tests/char/hex.leo @@ -1,4 +1,4 @@ function main() { - const heart: char = '\u{2764}'; - const Hiragana = '\u{306E}'; + const heart: char = '❤'; + const Hiragana = 'の'; } \ No newline at end of file diff --git a/compiler/tests/char/mod.rs b/compiler/tests/char/mod.rs index 7da9156969..21e872ef45 100644 --- a/compiler/tests/char/mod.rs +++ b/compiler/tests/char/mod.rs @@ -42,6 +42,16 @@ fn test_basic() { assert_satisfied(program); } +#[test] +fn test_circuit() { + let program_string = include_str!("circuit.leo"); + let char_input_string = include_str!("input/char.in"); + + let program = parse_program_with_input(program_string, char_input_string).unwrap(); + + assert_satisfied(program); +} + #[test] fn test_escapes() { let program_string = include_str!("escapes.leo"); @@ -67,11 +77,17 @@ fn test_function() { } #[test] -fn test_circuit() { - let program_string = include_str!("circuit.leo"); - let char_input_string = include_str!("input/char.in"); - - let program = parse_program_with_input(program_string, char_input_string).unwrap(); +fn test_octal() { + let program_string = include_str!("octal.leo"); + let program = parse_program(program_string).unwrap(); + + assert_satisfied(program); +} + +#[test] +fn test_unicode() { + let program_string = include_str!("unicode.leo"); + let program = parse_program(program_string).unwrap(); assert_satisfied(program); } diff --git a/compiler/tests/char/octal.leo b/compiler/tests/char/octal.leo new file mode 100644 index 0000000000..8591646ba9 --- /dev/null +++ b/compiler/tests/char/octal.leo @@ -0,0 +1,4 @@ +function main() { + const tab: char = '\xO011'; + const z = '\xO172'; +} \ No newline at end of file diff --git a/compiler/tests/char/unicode.leo b/compiler/tests/char/unicode.leo new file mode 100644 index 0000000000..0cd61be0f5 --- /dev/null +++ b/compiler/tests/char/unicode.leo @@ -0,0 +1,4 @@ +function main() { + const heart: char = '\u{2764}'; + const Hiragana = '\u{306E}'; +} \ No newline at end of file diff --git a/input/src/leo-input.pest b/input/src/leo-input.pest index 1fadc16f20..99874fd882 100644 --- a/input/src/leo-input.pest +++ b/input/src/leo-input.pest @@ -137,10 +137,15 @@ number_positive = @{ ASCII_DIGIT+ } // ANY is equivalent to '\u{00}'..'\u{10FFFF}' basic_char = { ANY } escaped_char = @{ "\\" ~ ("\"" | "\'" | "\\" | "/" | "b" | "f" | "n" | "r" | "t") } -hex_char = @{ "\\" ~ "u" ~ "{" ~ ASCII_HEX_DIGIT{4} ~ "}" } +hex_char = @{ "\\" ~ "x" ~ "H" ~ ASCII_HEX_DIGIT{1, 2} } +octal_char = @{ "\\" ~ "x" ~ "O" ~ ASCII_DIGIT{3} } +unicode_char = @{ "\\" ~ "u" ~ "{" ~ ASCII_HEX_DIGIT{1, 6} ~ "}" } + char_types = { escaped_char + | unicode_char | hex_char + | octal_char | basic_char } diff --git a/input/src/values/char_types.rs b/input/src/values/char_types.rs index 33bc1e621b..5c0a56f3a5 100644 --- a/input/src/values/char_types.rs +++ b/input/src/values/char_types.rs @@ -49,12 +49,32 @@ pub struct HexChar<'ast> { pub span: Span<'ast>, } +#[derive(Clone, Debug, FromPest, PartialEq, Eq)] +#[pest_ast(rule(Rule::octal_char))] +pub struct OctalChar<'ast> { + #[pest_ast(outer(with(span_into_string)))] + pub value: String, + #[pest_ast(outer())] + pub span: Span<'ast>, +} + +#[derive(Clone, Debug, FromPest, PartialEq, Eq)] +#[pest_ast(rule(Rule::unicode_char))] +pub struct UnicodeChar<'ast> { + #[pest_ast(outer(with(span_into_string)))] + pub value: String, + #[pest_ast(outer())] + pub span: Span<'ast>, +} + #[derive(Clone, Debug, FromPest, PartialEq, Eq)] #[pest_ast(rule(Rule::char_types))] pub enum CharTypes<'ast> { Basic(BasicChar<'ast>), Escaped(EscapedChar<'ast>), Hex(HexChar<'ast>), + Octal(OctalChar<'ast>), + Unicode(UnicodeChar<'ast>), } impl<'ast> CharTypes<'ast> { @@ -75,8 +95,28 @@ impl<'ast> CharTypes<'ast> { Err(InputParserError::invalid_char(character.value, &character.span)) } Self::Hex(character) => { - let hex_string_number = character.value[3..=6].to_string(); - if let Ok(hex) = u32::from_str_radix(&hex_string_number, 16) { + let hex_string_number = character.value[3..character.value.len()].to_string(); + if let Ok(number) = u8::from_str_radix(&hex_string_number, 16) { + if number < 127 { + return Ok(number as char); + } + } + + Err(InputParserError::invalid_char(character.value, &character.span)) + } + Self::Octal(character) => { + let octal_string_number = character.value[3..character.value.len()].to_string(); + if let Ok(number) = u8::from_str_radix(&octal_string_number, 8) { + if number < 127 { + return Ok(number as char); + } + } + + Err(InputParserError::invalid_char(character.value, &character.span)) + } + Self::Unicode(character) => { + let unicode_string_number = character.value[3..=character.value.len() - 2].to_string(); + if let Ok(hex) = u32::from_str_radix(&unicode_string_number, 16) { if let Some(unicode) = std::char::from_u32(hex) { return Ok(unicode); } diff --git a/parser/src/tokenizer/lexer.rs b/parser/src/tokenizer/lexer.rs index 3e4aa64d51..e55ac78e08 100644 --- a/parser/src/tokenizer/lexer.rs +++ b/parser/src/tokenizer/lexer.rs @@ -61,6 +61,162 @@ fn eat_identifier(input_tendril: &StrTendril) -> Option { } impl Token { + /// + /// Returns a new `StrTendril` string if an character can be eaten, otherwise returns [`None`]. + /// + fn eat_char(input_tendril: &StrTendril) -> (usize, Option) { + if input_tendril.is_empty() { + return (0, None); + } + + let input = input_tendril[..].as_bytes(); + let mut i = 1; + let mut escaped = false; + let mut hex = false; + let mut octal = false; + let mut unicode = false; + let mut last = false; + let mut characters: Vec = vec![]; + + while i < input.len() { + if !escaped { + if input[i] == b'\'' { + last = true; + i += 1; + break; + } + + if input[i] == b'{' { + i += 1; + characters.clear(); + continue; + } + + if input[i] == b'}' { + i += 1; + continue; + } + } else { + escaped = false; + characters.clear(); + + match input[i] { + b'0' => characters.push(0), + b't' => characters.push(9), + b'n' => characters.push(10), + b'r' => characters.push(13), + b'\"' => characters.push(34), + b'\'' => characters.push(39), + b'\\' => characters.push(92), + b'x' => { + i += 1; + match input[i] { + b'H' => { + hex = true; + } + b'O' => { + octal = true; + } + _ => { + return (0, None); + } + } + + i += 1; + continue; + } + b'u' => { + unicode = true; + } + _ => { + return (0, None); + } + } + + i += 1; + + continue; + } + + if input[i] == b'\\' { + escaped = true; + } + + characters.push(input[i]); + i += 1; + } + + if !last { + return (0, None); + } + + return match characters.len() { + 1 => { + if hex { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Ok(number) = u8::from_str_radix(&string, 16) { + if number < 127 { + return (i, Some(Token::CharLit(number as char))); + } + } + } + } + + (i, Some(Token::CharLit(characters[0] as char))) + } + 2 => { + if hex { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Ok(number) = u8::from_str_radix(&string, 16) { + if number < 127 { + return (i, Some(Token::CharLit(number as char))); + } + } + } + } + + if unicode { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if let Some(character) = string.chars().next() { + return (i, Some(Token::CharLit(character))); + } + } + } + + (0, None) + } + 3 => { + if let Ok(string) = std::str::from_utf8(&characters[..]) { + if octal { + if let Ok(number) = u8::from_str_radix(&string, 8) { + if number < 127 { + return (i, Some(Token::CharLit(number as char))); + } + } + } + + if let Some(character) = string.chars().next() { + return (i, Some(Token::CharLit(character))); + } + } + + (0, None) + } + 4 | 5 | 6 => { + if let Ok(unicode_string) = std::str::from_utf8(&characters[..]) { + if let Ok(hex) = u32::from_str_radix(&unicode_string, 16) { + if let Some(unicode_char) = std::char::from_u32(hex) { + return (i, Some(Token::CharLit(unicode_char))); + } + } + } + + (0, None) + } + _ => (0, None), + }; + } + /// /// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`]. /// An integer can be eaten if its bytes are at the front of the given `input_tendril` string. @@ -151,184 +307,7 @@ impl Token { return (i + 1, Some(Token::FormatString(segments))); } b'\'' => { - let mut i = 1; - let mut escaped = false; - let mut hex = false; - let mut octal = false; - let mut unicode = true; - let mut characters: Vec = vec![]; - - while i < input.len() { - if !escaped { - if input[i] == b'\'' { - i += 1; - break; - } - - if input[i] == b'{' { - i += 1; - characters.clear(); - continue; - } - - if input[i] == b'}' { - i += 1; - continue; - } - } else { - escaped = false; - characters.clear(); - - match input[i] { - b'0' => characters.push(0), - b't' => characters.push(9), - b'n' => characters.push(10), - b'r' => characters.push(13), - b'\"' => characters.push(34), - b'\'' => characters.push(39), - b'\\' => characters.push(92), - b'x' => { - i += 1; - match input[i] { - b'H' => { - hex = true; - } - b'O' => { - octal = true; - } - _ => { - return (0, None); - } - } - - i += 1; - continue; - } - b'u' => { - unicode = true; - } - _ => { - return (0, None); - } - } - - i += 1; - continue; - } - - if input[i] == b'\\' { - escaped = true; - } - - characters.push(input[i]); - i += 1; - } - - if i == input.len() { - return (0, None); - } - - return match characters.len() { - 1 => { - if hex { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Ok(number) = u8::from_str_radix(&string, 16) { - if number < 127 { - return (i, Some(Token::CharLit(number as char))); - } - } - } - } - - (i, Some(Token::CharLit(characters[0] as char))) - } - 2 => { - if hex { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Ok(number) = u8::from_str_radix(&string, 16) { - if number < 127 { - return (i, Some(Token::CharLit(number as char))); - } - } - } - } - - if unicode { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if let Some(character) = string.chars().next() { - return (i, Some(Token::CharLit(character))); - } - } - } - - (0, None) - } - 3 => { - if let Ok(string) = std::str::from_utf8(&characters[..]) { - if octal { - if let Ok(number) = u8::from_str_radix(&string, 8) { - if number < 127 { - return (i, Some(Token::CharLit(number as char))); - } - } - } - - if let Some(character) = string.chars().next() { - return (i, Some(Token::CharLit(character))); - } - } - - (0, None) - } - 4 | 5 | 6 => { - if let Ok(unicode_string) = std::str::from_utf8(&characters[..]) { - if let Ok(hex) = u32::from_str_radix(&unicode_string, 16) { - if let Some(unicode_char) = std::char::from_u32(hex) { - return (i, Some(Token::CharLit(unicode_char))); - } - } - } - - (0, None) - } - _ => (0, None), - }; - - // while i < input.len() { - // if !in_escape { - // if input[i] == b'\'' { - // break; - // } - // if input[i] == b'\\' { - // in_escape = !in_escape; - // } else { - // character.push(input[i] as char); - // } - // } else { - // in_escape = false; - // if input[i] == b'u' { - // i += 2; - // let mut j = i; - // let mut size = 0; - // while input[j] != b'}' { - // j += 1; - // size += 1; - // } - // let hex_string_number: String = input_tendril.subtendril(i as u32, size).to_string(); - // if let Ok(hex) = u32::from_str_radix(&hex_string_number, 16) { - // if let Some(unicode) = std::char::from_u32(hex) { - // i = j; - // character = unicode.to_string(); - // } - // } else { - // return (0, None); - // } - // } else { - // character.push(input[i] as char); - // } - // } - // i += 1; - // } + return Self::eat_char(&input_tendril); } x if x.is_ascii_digit() => { return Self::eat_integer(&input_tendril); diff --git a/tests/expectations/parser/parser/expression/literal/char.leo.out b/tests/expectations/parser/parser/expression/literal/char.leo.out index 126cebd1b8..0370692c2e 100644 --- a/tests/expectations/parser/parser/expression/literal/char.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char.leo.out @@ -3,8 +3,16 @@ namespace: Token expectation: Pass outputs: - "'a' @ 1:1-4" - - "'A' @ 1:1-4" - - "''' @ 1:1-5" - - "'\\' @ 1:1-5" - - "'n' @ 1:1-5" + - "'Z' @ 1:1-4" + - "'\"' @ 1:1-5" + - "'' @ 1:1-5" + - "'' @ 1:1-5" + - "'\u0000' @ 1:1-5" - "'❤' @ 1:1-11" + - "'の' @ 1:1-11" + - "'❤' @ 1:1-6" + - "'の' @ 1:1-6" + - "'*' @ 1:1-8" + - "'' @ 1:1-7" + - "'' @ 1:1-9" + - "'z' @ 1:1-9" diff --git a/tests/expectations/parser/parser/expression/literal/char_parse.leo.out b/tests/expectations/parser/parser/expression/literal/char_parse.leo.out index 4b4e1aa952..bcbbb81ddd 100644 --- a/tests/expectations/parser/parser/expression/literal/char_parse.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_parse.leo.out @@ -13,13 +13,13 @@ outputs: content: "'a'" - Value: Char: - - b + - Z - line_start: 1 line_stop: 1 col_start: 1 col_stop: 4 path: test - content: "'b'" + content: "'Z'" - Value: Char: - "\"" @@ -31,7 +31,7 @@ outputs: content: "'\\\"'" - Value: Char: - - t + - "\t" - line_start: 1 line_stop: 1 col_start: 1 @@ -40,7 +40,7 @@ outputs: content: "'\\t'" - Value: Char: - - r + - "\r" - line_start: 1 line_stop: 1 col_start: 1 @@ -49,7 +49,7 @@ outputs: content: "'\\r'" - Value: Char: - - "0" + - "\u0000" - line_start: 1 line_stop: 1 col_start: 1 @@ -65,3 +65,66 @@ outputs: col_stop: 11 path: test content: "'\\u{2764}'" + - Value: + Char: + - の + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 11 + path: test + content: "'\\u{306E}'" + - Value: + Char: + - ❤ + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 6 + path: test + content: "'❤'" + - Value: + Char: + - の + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 6 + path: test + content: "'の'" + - Value: + Char: + - "*" + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 8 + path: test + content: "'\\xH2A'" + - Value: + Char: + - "\t" + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 7 + path: test + content: "'\\xH9'" + - Value: + Char: + - "\t" + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 9 + path: test + content: "'\\xO011'" + - Value: + Char: + - z + - line_start: 1 + line_stop: 1 + col_start: 1 + col_stop: 9 + path: test + content: "'\\xO172'" diff --git a/tests/parser/expression/literal/char_parse.leo b/tests/parser/expression/literal/char_parse.leo index a0388a3a48..1bb25c489e 100644 --- a/tests/parser/expression/literal/char_parse.leo +++ b/tests/parser/expression/literal/char_parse.leo @@ -12,6 +12,7 @@ expectation: Pass '\u{2764}' '\u{306E}' '❤' +'の' '\xH2A' '\xH9' '\xO011'