From 789862103e4cea8b20ac64e9da354eb81f3852a7 Mon Sep 17 00:00:00 2001 From: gluax <16431709+gluax@users.noreply.github.com> Date: Fri, 4 Mar 2022 11:18:28 -0800 Subject: [PATCH] another char bug fix --- compiler/parser/src/parser/file.rs | 3 +- compiler/parser/src/tokenizer/lexer.rs | 32 +++++++++++++++---- .../expression/literal/char_fail.leo.out | 3 +- tests/parser/expression/literal/char_fail.leo | 1 + 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/compiler/parser/src/parser/file.rs b/compiler/parser/src/parser/file.rs index 4775468001..6318e66600 100644 --- a/compiler/parser/src/parser/file.rs +++ b/compiler/parser/src/parser/file.rs @@ -438,8 +438,7 @@ impl ParserContext<'_> { let param = p.parse_function_parameters(first).map(Some); first = false; param - } - )?; + })?; // Parse return type. let output = if self.eat(Token::Arrow).is_some() { diff --git a/compiler/parser/src/tokenizer/lexer.rs b/compiler/parser/src/tokenizer/lexer.rs index 631e0f853d..ce74b35547 100644 --- a/compiler/parser/src/tokenizer/lexer.rs +++ b/compiler/parser/src/tokenizer/lexer.rs @@ -176,7 +176,7 @@ impl Token { } /// Returns the number of bytes in an emoji via a bit mask. - fn utf8_byte_count(byte: u8) -> u8 { + fn utf8_byte_count(byte: u8) -> usize { let mut mask = 0x80; let mut result = 0; while byte & mask > 0 { @@ -205,7 +205,7 @@ impl Token { x if x.is_ascii_whitespace() => return Ok((1, Token::WhiteSpace)), b'"' => { let mut i = 1; - let mut len: u8 = 1; + let mut len = 1; let mut start = 1; let mut in_escape = false; let mut escaped = false; @@ -218,7 +218,7 @@ impl Token { // If it's an emoji get the length. if input[i] & 0x80 > 0 { len = Self::utf8_byte_count(input[i]); - i += (len as usize) - 1; + i += len - 1; } if !in_escape { @@ -287,14 +287,27 @@ impl Token { let mut in_escape = false; let mut escaped = false; let mut hex = false; - let mut unicode = false; + let mut escaped_unicode = false; + let mut unicode_char = false; let mut end = false; while i < input.len() { - if !in_escape { + if input[i] & 0x80 > 0 && !unicode_char { + i += Self::utf8_byte_count(input[i]); + unicode_char = true; + continue; + } else if input[i] & 0x80 > 0 && unicode_char { + i += Self::utf8_byte_count(input[i]); + return Err(ParserError::lexer_invalid_char(&input_tendril[0..i]).into()); + } else if !in_escape || unicode_char { if input[i] == b'\'' { end = true; break; + } else if unicode_char { + return Err(ParserError::lexer_invalid_char( + &input_tendril[0..input_tendril[1..].find('\'').unwrap_or(i + 1)], + ) + .into()); } else if input[i] == b'\\' { in_escape = true; } @@ -303,7 +316,7 @@ impl Token { hex = true; } else if input[i] == b'u' { if input[i + 1] == b'{' { - unicode = true; + escaped_unicode = true; } else { return Err(ParserError::lexer_expected_valid_escaped_char(input[i]).into()); } @@ -321,7 +334,12 @@ impl Token { return Err(ParserError::lexer_char_not_closed(String::from_utf8_lossy(&input[0..i])).into()); } - let character = Self::eat_char(input_tendril.subtendril(1, (i - 1) as u32), escaped, hex, unicode)?; + let character = Self::eat_char( + input_tendril.subtendril(1, (i - 1) as u32), + escaped, + hex, + escaped_unicode, + )?; return Ok((i + 1, Token::CharLit(character))); } x if x.is_ascii_digit() => { diff --git a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out index 3378569ca7..ece3a8f208 100644 --- a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out +++ b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out @@ -38,4 +38,5 @@ outputs: - "Error [EPAR0370026]: Expected a valid escape character but found `117`." - "Error [EPAR0370039]: The escaped unicode char `110000` is greater than 0x10FFFF." - "Error [EPAR0370037]: There was no closing `}` after a escaped unicode `\\u{af🦀`." - - "Error [EPAR0370028]: Expected a closed char but found `😭😂😘`." + - "Error [EPAR0370029]: Expected valid character but found `'🦀\\`." + - "Error [EPAR0370029]: Expected valid character but found `'😭😂`." diff --git a/tests/parser/expression/literal/char_fail.leo b/tests/parser/expression/literal/char_fail.leo index c5c0b94eb3..12dfdec1f4 100644 --- a/tests/parser/expression/literal/char_fail.leo +++ b/tests/parser/expression/literal/char_fail.leo @@ -45,5 +45,6 @@ expectation: Fail '\u9999999' '\u{110000}' '\u{af🦀' +'🦀\n' '😭😂😘'