chars done, should merge in compiler tests

This commit is contained in:
gluax 2021-05-13 17:25:33 -04:00
parent 53cd6e2a12
commit 733bc90bbe
10 changed files with 317 additions and 197 deletions

View File

@ -1,4 +1,4 @@
function main() { function main() {
const heart: char = '\u{2764}'; const heart: char = '';
const Hiragana = '\u{306E}'; const Hiragana = '';
} }

View File

@ -42,6 +42,16 @@ fn test_basic() {
assert_satisfied(program); assert_satisfied(program);
} }
/// Compiles `circuit.leo` together with the `input/char.in` input file and
/// checks the resulting program with `assert_satisfied`.
#[test]
fn test_circuit() {
    let source = include_str!("circuit.leo");
    let input = include_str!("input/char.in");

    let compiled = parse_program_with_input(source, input).unwrap();

    assert_satisfied(compiled);
}
#[test] #[test]
fn test_escapes() { fn test_escapes() {
let program_string = include_str!("escapes.leo"); let program_string = include_str!("escapes.leo");
@ -67,11 +77,17 @@ fn test_function() {
} }
#[test] #[test]
fn test_circuit() { fn test_octal() {
let program_string = include_str!("circuit.leo"); let program_string = include_str!("octal.leo");
let char_input_string = include_str!("input/char.in"); let program = parse_program(program_string).unwrap();
let program = parse_program_with_input(program_string, char_input_string).unwrap(); assert_satisfied(program);
}
#[test]
fn test_unicode() {
let program_string = include_str!("unicode.leo");
let program = parse_program(program_string).unwrap();
assert_satisfied(program); assert_satisfied(program);
} }

View File

@ -0,0 +1,4 @@
// Declares two character constants written as octal escapes (`\xO` followed by three digits):
// octal 011 is decimal 9 (horizontal tab) and octal 172 is decimal 122 (`z`).
function main() {
const tab: char = '\xO011';
const z = '\xO172';
}

View File

@ -0,0 +1,4 @@
// Declares two character constants written as Unicode escapes (`\u{...}` with a hexadecimal code point):
// U+2764 is the heavy black heart and U+306E is the Hiragana letter "no".
function main() {
const heart: char = '\u{2764}';
const Hiragana = '\u{306E}';
}

View File

@ -137,10 +137,15 @@ number_positive = @{ ASCII_DIGIT+ }
// ANY is equivalent to '\u{00}'..'\u{10FFFF}' // ANY is equivalent to '\u{00}'..'\u{10FFFF}'
basic_char = { ANY } basic_char = { ANY }
escaped_char = @{ "\\" ~ ("\"" | "\'" | "\\" | "/" | "b" | "f" | "n" | "r" | "t") } escaped_char = @{ "\\" ~ ("\"" | "\'" | "\\" | "/" | "b" | "f" | "n" | "r" | "t") }
hex_char = @{ "\\" ~ "u" ~ "{" ~ ASCII_HEX_DIGIT{4} ~ "}" } hex_char = @{ "\\" ~ "x" ~ "H" ~ ASCII_HEX_DIGIT{1, 2} }
// An octal character escape: a backslash, `x`, `O`, then exactly three digits (e.g. `\xO011`).
// NOTE(review): ASCII_DIGIT also admits `8` and `9`, which are not octal digits; pest's built-in
// ASCII_OCT_DIGIT may be what is intended here — confirm.
octal_char = @{ "\\" ~ "x" ~ "O" ~ ASCII_DIGIT{3} }
// A Unicode character escape: a backslash, `u`, `{`, one to six hexadecimal digits, then `}`
// (e.g. `\u{2764}`).
unicode_char = @{ "\\" ~ "u" ~ "{" ~ ASCII_HEX_DIGIT{1, 6} ~ "}" }
char_types = { char_types = {
escaped_char escaped_char
| unicode_char
| hex_char | hex_char
| octal_char
| basic_char | basic_char
} }

View File

@ -49,12 +49,32 @@ pub struct HexChar<'ast> {
pub span: Span<'ast>, pub span: Span<'ast>,
} }
/// AST node for a character matched by the `octal_char` grammar rule.
#[derive(Clone, Debug, FromPest, PartialEq, Eq)]
#[pest_ast(rule(Rule::octal_char))]
pub struct OctalChar<'ast> {
/// The matched text, obtained by passing the rule's outer span through `span_into_string`.
/// NOTE(review): presumably the whole escape including its prefix (e.g. `\xO011`) — confirm
/// against the code that slices the digits out of this value.
#[pest_ast(outer(with(span_into_string)))]
pub value: String,
/// The outer span of the match.
#[pest_ast(outer())]
pub span: Span<'ast>,
}
/// AST node for a character matched by the `unicode_char` grammar rule.
#[derive(Clone, Debug, FromPest, PartialEq, Eq)]
#[pest_ast(rule(Rule::unicode_char))]
pub struct UnicodeChar<'ast> {
/// The matched text, obtained by passing the rule's outer span through `span_into_string`.
/// NOTE(review): presumably the whole escape including braces (e.g. `\u{2764}`) — confirm
/// against the code that slices the digits out of this value.
#[pest_ast(outer(with(span_into_string)))]
pub value: String,
/// The outer span of the match.
#[pest_ast(outer())]
pub span: Span<'ast>,
}
#[derive(Clone, Debug, FromPest, PartialEq, Eq)] #[derive(Clone, Debug, FromPest, PartialEq, Eq)]
#[pest_ast(rule(Rule::char_types))] #[pest_ast(rule(Rule::char_types))]
pub enum CharTypes<'ast> { pub enum CharTypes<'ast> {
Basic(BasicChar<'ast>), Basic(BasicChar<'ast>),
Escaped(EscapedChar<'ast>), Escaped(EscapedChar<'ast>),
Hex(HexChar<'ast>), Hex(HexChar<'ast>),
Octal(OctalChar<'ast>),
Unicode(UnicodeChar<'ast>),
} }
impl<'ast> CharTypes<'ast> { impl<'ast> CharTypes<'ast> {
@ -75,8 +95,28 @@ impl<'ast> CharTypes<'ast> {
Err(InputParserError::invalid_char(character.value, &character.span)) Err(InputParserError::invalid_char(character.value, &character.span))
} }
Self::Hex(character) => { Self::Hex(character) => {
let hex_string_number = character.value[3..=6].to_string(); let hex_string_number = character.value[3..character.value.len()].to_string();
if let Ok(hex) = u32::from_str_radix(&hex_string_number, 16) { if let Ok(number) = u8::from_str_radix(&hex_string_number, 16) {
if number < 127 {
return Ok(number as char);
}
}
Err(InputParserError::invalid_char(character.value, &character.span))
}
Self::Octal(character) => {
let octal_string_number = character.value[3..character.value.len()].to_string();
if let Ok(number) = u8::from_str_radix(&octal_string_number, 8) {
if number < 127 {
return Ok(number as char);
}
}
Err(InputParserError::invalid_char(character.value, &character.span))
}
Self::Unicode(character) => {
let unicode_string_number = character.value[3..=character.value.len() - 2].to_string();
if let Ok(hex) = u32::from_str_radix(&unicode_string_number, 16) {
if let Some(unicode) = std::char::from_u32(hex) { if let Some(unicode) = std::char::from_u32(hex) {
return Ok(unicode); return Ok(unicode);
} }

View File

@ -61,6 +61,162 @@ fn eat_identifier(input_tendril: &StrTendril) -> Option<StrTendril> {
} }
impl Token { impl Token {
///
/// Returns a tuple: [(char literal length in bytes, char token)] if a character literal can be
/// eaten, otherwise returns `(0, None)`.
///
/// Byte 0 of `input_tendril` is never inspected: scanning starts at byte 1 and stops just past
/// the first unescaped `'`. If no closing `'` is found the literal is rejected.
///
/// The body of the literal is decoded according to the escape that introduced it:
/// - `\u` followed by 1 to 6 hexadecimal digits (braces are skipped): that Unicode code point;
/// - `\xH` followed by 1 or 2 hexadecimal digits: that value, which must be below 127;
/// - `\xO` followed by exactly 3 octal digits: that value, which must be below 127;
/// - `\0`, `\t`, `\n`, `\r`, `\"`, `\'`, `\\`: the corresponding character;
/// - otherwise the body must be exactly one UTF-8 encoded character.
///
/// Any other `\` escape, and input that ends in the middle of an escape, is rejected instead
/// of panicking.
///
fn eat_char(input_tendril: &StrTendril) -> (usize, Option<Token>) {
    // Parses `digits` as an unsigned number in `radix`. Empty input and anything that is not
    // a digit of that radix (including a leading `+`, which `from_str_radix` alone would
    // accept) yields `None`.
    fn parse_digits(digits: &str, radix: u32) -> Option<u32> {
        if digits.is_empty() || !digits.chars().all(|digit| digit.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix).ok()
    }

    if input_tendril.is_empty() {
        return (0, None);
    }

    let input = input_tendril[..].as_bytes();
    let mut i = 1;
    let mut escaped = false;
    let mut hex = false;
    let mut octal = false;
    let mut unicode = false;
    let mut last = false;
    let mut characters: Vec<u8> = vec![];

    while i < input.len() {
        if !escaped {
            if input[i] == b'\'' {
                // Closing quote: `i` ends up pointing just past it.
                last = true;
                i += 1;
                break;
            }

            if input[i] == b'{' {
                // Opening brace: drop whatever was collected so far.
                i += 1;
                characters.clear();
                continue;
            }

            if input[i] == b'}' {
                i += 1;
                continue;
            }
        } else {
            escaped = false;
            // Discard the `\` that was pushed when the escape started.
            characters.clear();

            match input[i] {
                b'0' => characters.push(0),
                b't' => characters.push(9),
                b'n' => characters.push(10),
                b'r' => characters.push(13),
                b'\"' => characters.push(34),
                b'\'' => characters.push(39),
                b'\\' => characters.push(92),
                b'x' => {
                    i += 1;

                    // `get` rather than indexing: the input may end right after `\x`.
                    match input.get(i) {
                        Some(b'H') => {
                            hex = true;
                        }
                        Some(b'O') => {
                            octal = true;
                        }
                        _ => {
                            return (0, None);
                        }
                    }

                    i += 1;
                    continue;
                }
                b'u' => {
                    unicode = true;
                }
                _ => {
                    return (0, None);
                }
            }

            i += 1;
            continue;
        }

        if input[i] == b'\\' {
            escaped = true;
        }

        characters.push(input[i]);
        i += 1;
    }

    if !last {
        return (0, None);
    }

    let text = match std::str::from_utf8(&characters[..]) {
        Ok(text) => text,
        Err(_) => return (0, None),
    };

    // Decode by the kind of escape that was seen, not by how many bytes were collected, so
    // that e.g. `\u{41}` is a code point and a raw multi-byte character is a single char.
    let decoded = if unicode {
        if text.len() <= 6 {
            parse_digits(text, 16).and_then(std::char::from_u32)
        } else {
            None
        }
    } else if hex {
        if text.len() <= 2 {
            parse_digits(text, 16)
                .filter(|number| *number < 127)
                .and_then(std::char::from_u32)
        } else {
            None
        }
    } else if octal {
        if text.len() == 3 {
            parse_digits(text, 8)
                .filter(|number| *number < 127)
                .and_then(std::char::from_u32)
        } else {
            None
        }
    } else {
        // A raw character or a simple escape: exactly one character must remain.
        let mut remaining = text.chars();
        match (remaining.next(), remaining.next()) {
            (Some(character), None) => Some(character),
            _ => None,
        }
    };

    match decoded {
        Some(character) => (i, Some(Token::CharLit(character))),
        None => (0, None),
    }
}
/// ///
/// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`]. /// Returns a tuple: [(integer length, integer token)] if an integer can be eaten, otherwise returns [`None`].
/// An integer can be eaten if its bytes are at the front of the given `input_tendril` string. /// An integer can be eaten if its bytes are at the front of the given `input_tendril` string.
@ -151,184 +307,7 @@ impl Token {
return (i + 1, Some(Token::FormatString(segments))); return (i + 1, Some(Token::FormatString(segments)));
} }
b'\'' => { b'\'' => {
let mut i = 1; return Self::eat_char(&input_tendril);
let mut escaped = false;
let mut hex = false;
let mut octal = false;
let mut unicode = true;
let mut characters: Vec<u8> = vec![];
while i < input.len() {
if !escaped {
if input[i] == b'\'' {
i += 1;
break;
}
if input[i] == b'{' {
i += 1;
characters.clear();
continue;
}
if input[i] == b'}' {
i += 1;
continue;
}
} else {
escaped = false;
characters.clear();
match input[i] {
b'0' => characters.push(0),
b't' => characters.push(9),
b'n' => characters.push(10),
b'r' => characters.push(13),
b'\"' => characters.push(34),
b'\'' => characters.push(39),
b'\\' => characters.push(92),
b'x' => {
i += 1;
match input[i] {
b'H' => {
hex = true;
}
b'O' => {
octal = true;
}
_ => {
return (0, None);
}
}
i += 1;
continue;
}
b'u' => {
unicode = true;
}
_ => {
return (0, None);
}
}
i += 1;
continue;
}
if input[i] == b'\\' {
escaped = true;
}
characters.push(input[i]);
i += 1;
}
if i == input.len() {
return (0, None);
}
return match characters.len() {
1 => {
if hex {
if let Ok(string) = std::str::from_utf8(&characters[..]) {
if let Ok(number) = u8::from_str_radix(&string, 16) {
if number < 127 {
return (i, Some(Token::CharLit(number as char)));
}
}
}
}
(i, Some(Token::CharLit(characters[0] as char)))
}
2 => {
if hex {
if let Ok(string) = std::str::from_utf8(&characters[..]) {
if let Ok(number) = u8::from_str_radix(&string, 16) {
if number < 127 {
return (i, Some(Token::CharLit(number as char)));
}
}
}
}
if unicode {
if let Ok(string) = std::str::from_utf8(&characters[..]) {
if let Some(character) = string.chars().next() {
return (i, Some(Token::CharLit(character)));
}
}
}
(0, None)
}
3 => {
if let Ok(string) = std::str::from_utf8(&characters[..]) {
if octal {
if let Ok(number) = u8::from_str_radix(&string, 8) {
if number < 127 {
return (i, Some(Token::CharLit(number as char)));
}
}
}
if let Some(character) = string.chars().next() {
return (i, Some(Token::CharLit(character)));
}
}
(0, None)
}
4 | 5 | 6 => {
if let Ok(unicode_string) = std::str::from_utf8(&characters[..]) {
if let Ok(hex) = u32::from_str_radix(&unicode_string, 16) {
if let Some(unicode_char) = std::char::from_u32(hex) {
return (i, Some(Token::CharLit(unicode_char)));
}
}
}
(0, None)
}
_ => (0, None),
};
// while i < input.len() {
// if !in_escape {
// if input[i] == b'\'' {
// break;
// }
// if input[i] == b'\\' {
// in_escape = !in_escape;
// } else {
// character.push(input[i] as char);
// }
// } else {
// in_escape = false;
// if input[i] == b'u' {
// i += 2;
// let mut j = i;
// let mut size = 0;
// while input[j] != b'}' {
// j += 1;
// size += 1;
// }
// let hex_string_number: String = input_tendril.subtendril(i as u32, size).to_string();
// if let Ok(hex) = u32::from_str_radix(&hex_string_number, 16) {
// if let Some(unicode) = std::char::from_u32(hex) {
// i = j;
// character = unicode.to_string();
// }
// } else {
// return (0, None);
// }
// } else {
// character.push(input[i] as char);
// }
// }
// i += 1;
// }
} }
x if x.is_ascii_digit() => { x if x.is_ascii_digit() => {
return Self::eat_integer(&input_tendril); return Self::eat_integer(&input_tendril);

View File

@ -3,8 +3,16 @@ namespace: Token
expectation: Pass expectation: Pass
outputs: outputs:
- "'a' @ 1:1-4" - "'a' @ 1:1-4"
- "'A' @ 1:1-4" - "'Z' @ 1:1-4"
- "''' @ 1:1-5" - "'\"' @ 1:1-5"
- "'\\' @ 1:1-5" - "'' @ 1:1-5"
- "'n' @ 1:1-5" - "'' @ 1:1-5"
- "'\u0000' @ 1:1-5"
- "'❤' @ 1:1-11" - "'❤' @ 1:1-11"
- "'の' @ 1:1-11"
- "'❤' @ 1:1-6"
- "'の' @ 1:1-6"
- "'*' @ 1:1-8"
- "'' @ 1:1-7"
- "'' @ 1:1-9"
- "'z' @ 1:1-9"

View File

@ -13,13 +13,13 @@ outputs:
content: "'a'" content: "'a'"
- Value: - Value:
Char: Char:
- b - Z
- line_start: 1 - line_start: 1
line_stop: 1 line_stop: 1
col_start: 1 col_start: 1
col_stop: 4 col_stop: 4
path: test path: test
content: "'b'" content: "'Z'"
- Value: - Value:
Char: Char:
- "\"" - "\""
@ -31,7 +31,7 @@ outputs:
content: "'\\\"'" content: "'\\\"'"
- Value: - Value:
Char: Char:
- t - "\t"
- line_start: 1 - line_start: 1
line_stop: 1 line_stop: 1
col_start: 1 col_start: 1
@ -40,7 +40,7 @@ outputs:
content: "'\\t'" content: "'\\t'"
- Value: - Value:
Char: Char:
- r - "\r"
- line_start: 1 - line_start: 1
line_stop: 1 line_stop: 1
col_start: 1 col_start: 1
@ -49,7 +49,7 @@ outputs:
content: "'\\r'" content: "'\\r'"
- Value: - Value:
Char: Char:
- "0" - "\u0000"
- line_start: 1 - line_start: 1
line_stop: 1 line_stop: 1
col_start: 1 col_start: 1
@ -65,3 +65,66 @@ outputs:
col_stop: 11 col_stop: 11
path: test path: test
content: "'\\u{2764}'" content: "'\\u{2764}'"
- Value:
Char:
- の
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 11
path: test
content: "'\\u{306E}'"
- Value:
Char:
- ❤
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 6
path: test
content: "'❤'"
- Value:
Char:
- の
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 6
path: test
content: "'の'"
- Value:
Char:
- "*"
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 8
path: test
content: "'\\xH2A'"
- Value:
Char:
- "\t"
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 7
path: test
content: "'\\xH9'"
- Value:
Char:
- "\t"
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 9
path: test
content: "'\\xO011'"
- Value:
Char:
- z
- line_start: 1
line_stop: 1
col_start: 1
col_stop: 9
path: test
content: "'\\xO172'"

View File

@ -12,6 +12,7 @@ expectation: Pass
'\u{2764}' '\u{2764}'
'\u{306E}' '\u{306E}'
'❤' '❤'
'の'
'\xH2A' '\xH2A'
'\xH9' '\xH9'
'\xO011' '\xO011'