diff --git a/compiler/parser/src/lib.rs b/compiler/parser/src/lib.rs
index 5593fad9e4..e5651133f4 100644
--- a/compiler/parser/src/lib.rs
+++ b/compiler/parser/src/lib.rs
@@ -43,7 +43,7 @@ pub fn parse_ast(handler: &Handler, source: &str, start_pos: BytePos) -> Result<
     Ok(Ast::new(parser::parse(handler, source, start_pos)?))
 }
 
-/// Parses program inputs from from the input file path and state file path
+/// Parses program inputs from the input file path
 pub fn parse_program_inputs(handler: &Handler, input_string: &str, start_pos: BytePos) -> Result {
     let program_input: ProgramInput = parser::parse_input(handler, input_string, start_pos)?.try_into()?;
 
diff --git a/compiler/parser/src/tokenizer/lexer.rs b/compiler/parser/src/tokenizer/lexer.rs
index e62b5c793a..639544a58f 100644
--- a/compiler/parser/src/tokenizer/lexer.rs
+++ b/compiler/parser/src/tokenizer/lexer.rs
@@ -175,7 +175,7 @@ impl Token {
         Ok((int.len(), Token::Integer(int)))
     }
 
-    /// Returns a tuple: [(token length, token)] if the next token can be eaten, otherwise returns [`None`].
+    /// Returns a tuple: [(token length, token)] if the next token can be eaten, otherwise returns an error.
     /// The next token can be eaten if the bytes at the front of the given `input` string can be scanned into a token.
     pub(crate) fn eat(input: &str) -> Result<(usize, Token)> {
         if input.is_empty() {
@@ -253,6 +253,10 @@ impl Token {
             '"' => {
                 // Find end string quotation mark.
                 // Instead of checking each `char` and pushing, we can avoid reallocations.
+                // This works because byte 34 (the code for a double quote) cannot appear
+                // in the middle of a multi-byte UTF-8 encoding of a character,
+                // since all bytes of a multi-byte encoding have the high bit set to 1;
+                // in UTF-8, byte 34 can only appear as the single-byte encoding of a double quote.
                 let rest = &input_str[1..];
                 let string = match rest.as_bytes().iter().position(|c| *c == b'"') {
                     None => return Err(ParserError::lexer_string_not_closed(rest).into()),
@@ -302,6 +306,10 @@ impl Token {
                 input.next();
                 if input.next_if_eq(&'/').is_some() {
                     // Find the end of the comment line.
+                    // This works because byte 10 (the code for a line feed) cannot appear
+                    // in the middle of a multi-byte UTF-8 encoding of a character,
+                    // since all bytes of a multi-byte encoding have the high bit set to 1;
+                    // in UTF-8, byte 10 can only appear as the single-byte encoding of a line feed.
                     let comment = match input_str.as_bytes().iter().position(|c| *c == b'\n') {
                         None => input_str,
                         Some(idx) => &input_str[..idx + 1],
diff --git a/compiler/parser/src/tokenizer/mod.rs b/compiler/parser/src/tokenizer/mod.rs
index 481176d475..2d7a7f7961 100644
--- a/compiler/parser/src/tokenizer/mod.rs
+++ b/compiler/parser/src/tokenizer/mod.rs
@@ -17,7 +17,7 @@
 //! The tokenizer to convert Leo code text into tokens.
 //!
 //! This module contains the [`tokenize()`] method which breaks down string text into tokens,
-//! separated by whitespace.
+//! optionally separated by whitespace.
 
 pub(crate) mod token;
 
diff --git a/compiler/parser/src/tokenizer/token.rs b/compiler/parser/src/tokenizer/token.rs
index bed70e3ca4..2db41bde09 100644
--- a/compiler/parser/src/tokenizer/token.rs
+++ b/compiler/parser/src/tokenizer/token.rs
@@ -147,9 +147,9 @@ pub enum Token {
 }
 
 /// Represents all valid Leo keyword tokens.
-/// This defers from the ABNF for the following reasons:
+/// This differs from the ABNF grammar for the following reason:
 /// Adding true and false to the keywords of the ABNF grammar makes the lexical grammar ambiguous,
-/// because true and false are also boolean literals, which are different tokens from keywords
+/// because true and false are also boolean literals, which are different tokens from keywords.
 pub const KEYWORD_TOKENS: &[Token] = &[
     Token::Address,
     Token::Assert,
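Note on the new lexer comments: the UTF-8 property they describe is what makes the raw byte scans in this patch (`position(|c| *c == b'"')` for strings and `position(|c| *c == b'\n')` for line comments) safe to run on `&str` slices. The standalone sketch below illustrates that property; `find_closing_quote` is a hypothetical helper written for this note, not code from the patch:

```rust
// Standalone sketch (illustrative only, not part of this patch).
// It demonstrates the UTF-8 property the new comments rely on:
// every byte of a multi-byte UTF-8 encoding has its high bit set,
// so an ASCII byte such as 34 (`"`) or 10 (`\n`) can only occur as
// the single-byte encoding of that character.

/// Hypothetical helper mirroring the patch's byte scan for a closing quote.
fn find_closing_quote(rest: &str) -> Option<&str> {
    rest.as_bytes()
        .iter()
        .position(|c| *c == b'"')
        // `idx` is guaranteed to sit on a char boundary, so slicing is safe.
        .map(|idx| &rest[..idx])
}

fn main() {
    // The multi-byte characters 'é' and '✓' contain no byte equal to 34,
    // so the scan cannot produce a false match inside them.
    assert_eq!(find_closing_quote("héllo ✓\" tail"), Some("héllo ✓"));
    assert_eq!(find_closing_quote("not closed"), None);

    // Check the property directly: no byte of a multi-byte encoding
    // is ever in the ASCII range (i.e., has its high bit clear).
    for ch in "é✓🦀".chars() {
        let mut buf = [0u8; 4];
        for &b in ch.encode_utf8(&mut buf).as_bytes() {
            assert!(b & 0x80 != 0);
        }
    }
}
```

The same argument applies to the line-feed scan in the comment-lexing hunk: because byte 10 can only be the one-byte encoding of `\n`, scanning bytes instead of decoding `char`s finds the true end of the comment without any risk of splitting a character.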