From 789862103e4cea8b20ac64e9da354eb81f3852a7 Mon Sep 17 00:00:00 2001
From: gluax <16431709+gluax@users.noreply.github.com>
Date: Fri, 4 Mar 2022 11:18:28 -0800
Subject: [PATCH] lexer: reject char literals with extra content after a non-ASCII char

---
 compiler/parser/src/parser/file.rs            |  3 +-
 compiler/parser/src/tokenizer/lexer.rs        | 32 +++++++++++++++----
 .../expression/literal/char_fail.leo.out      |  3 +-
 tests/parser/expression/literal/char_fail.leo |  1 +
 4 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/compiler/parser/src/parser/file.rs b/compiler/parser/src/parser/file.rs
index 4775468001..6318e66600 100644
--- a/compiler/parser/src/parser/file.rs
+++ b/compiler/parser/src/parser/file.rs
@@ -438,8 +438,7 @@ impl ParserContext<'_> {
             let param = p.parse_function_parameters(first).map(Some);
             first = false;
             param
-        }
-        )?;
+        })?;
 
         // Parse return type.
         let output = if self.eat(Token::Arrow).is_some() {
diff --git a/compiler/parser/src/tokenizer/lexer.rs b/compiler/parser/src/tokenizer/lexer.rs
index 631e0f853d..ce74b35547 100644
--- a/compiler/parser/src/tokenizer/lexer.rs
+++ b/compiler/parser/src/tokenizer/lexer.rs
@@ -176,7 +176,7 @@ impl Token {
     }
 
     /// Returns the number of bytes in an emoji via a bit mask.
-    fn utf8_byte_count(byte: u8) -> u8 {
+    fn utf8_byte_count(byte: u8) -> usize {
         let mut mask = 0x80;
         let mut result = 0;
         while byte & mask > 0 {
@@ -205,7 +205,7 @@ impl Token {
             x if x.is_ascii_whitespace() => return Ok((1, Token::WhiteSpace)),
             b'"' => {
                 let mut i = 1;
-                let mut len: u8 = 1;
+                let mut len = 1;
                 let mut start = 1;
                 let mut in_escape = false;
                 let mut escaped = false;
@@ -218,7 +218,7 @@ impl Token {
                     // If it's an emoji get the length.
                     if input[i] & 0x80 > 0 {
                         len = Self::utf8_byte_count(input[i]);
-                        i += (len as usize) - 1;
+                        i += len - 1;
                     }
 
                     if !in_escape {
@@ -287,14 +287,27 @@ impl Token {
                 let mut in_escape = false;
                 let mut escaped = false;
                 let mut hex = false;
-                let mut unicode = false;
+                let mut escaped_unicode = false;
+                let mut unicode_char = false;
                 let mut end = false;
 
                 while i < input.len() {
-                    if !in_escape {
+                    if input[i] & 0x80 > 0 && !unicode_char {
+                        i += Self::utf8_byte_count(input[i]);
+                        unicode_char = true;
+                        continue;
+                    } else if input[i] & 0x80 > 0 && unicode_char {
+                        i += Self::utf8_byte_count(input[i]);
+                        return Err(ParserError::lexer_invalid_char(&input_tendril[0..i]).into());
+                    } else if !in_escape || unicode_char {
                         if input[i] == b'\'' {
                             end = true;
                             break;
+                        } else if unicode_char {
+                            return Err(ParserError::lexer_invalid_char(
+                                &input_tendril[0..input_tendril[1..].find('\'').unwrap_or(i + 1)],
+                            )
+                            .into());
                         } else if input[i] == b'\\' {
                             in_escape = true;
                         }
@@ -303,7 +316,7 @@ impl Token {
                             hex = true;
                         } else if input[i] == b'u' {
                             if input[i + 1] == b'{' {
-                                unicode = true;
+                                escaped_unicode = true;
                             } else {
                                 return Err(ParserError::lexer_expected_valid_escaped_char(input[i]).into());
                             }
@@ -321,7 +334,12 @@ impl Token {
                     return Err(ParserError::lexer_char_not_closed(String::from_utf8_lossy(&input[0..i])).into());
                 }
 
-                let character = Self::eat_char(input_tendril.subtendril(1, (i - 1) as u32), escaped, hex, unicode)?;
+                let character = Self::eat_char(
+                    input_tendril.subtendril(1, (i - 1) as u32),
+                    escaped,
+                    hex,
+                    escaped_unicode,
+                )?;
                 return Ok((i + 1, Token::CharLit(character)));
             }
             x if x.is_ascii_digit() => {
diff --git a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out
index 3378569ca7..ece3a8f208 100644
--- a/tests/expectations/parser/parser/expression/literal/char_fail.leo.out
+++ b/tests/expectations/parser/parser/expression/literal/char_fail.leo.out
@@ -38,4 +38,5 @@ outputs:
   - "Error [EPAR0370026]: Expected a valid escape character but found `117`."
   - "Error [EPAR0370039]: The escaped unicode char `110000` is greater than 0x10FFFF."
   - "Error [EPAR0370037]: There was no closing `}` after a escaped unicode `\\u{af🦀`."
-  - "Error [EPAR0370028]: Expected a closed char but found `😭😂😘`."
+  - "Error [EPAR0370029]: Expected valid character but found `'🦀\\`."
+  - "Error [EPAR0370029]: Expected valid character but found `'😭😂`."
diff --git a/tests/parser/expression/literal/char_fail.leo b/tests/parser/expression/literal/char_fail.leo
index c5c0b94eb3..12dfdec1f4 100644
--- a/tests/parser/expression/literal/char_fail.leo
+++ b/tests/parser/expression/literal/char_fail.leo
@@ -45,5 +45,6 @@ expectation: Fail
 '\u9999999'
 '\u{110000}'
 '\u{af🦀'
+'🦀\n'
 
 '😭😂😘'