[parser] Extend and update some documentation.

2024-12-24 10:41:57 +03:00 · 2023-01-13 20:12:47 -08:00 · 2023-01-13 20:12:47 -08:00 · 5c223460c1
commit 5c223460c1
parent 1cbea97f27
4 changed files with 13 additions and 5 deletions
--- a/compiler/parser/src/lib.rs
+++ b/compiler/parser/src/lib.rs
@@ -43,7 +43,7 @@ pub fn parse_ast(handler: &Handler, source: &str, start_pos: BytePos) -> Result<
    Ok(Ast::new(parser::parse(handler, source, start_pos)?))
 }

-/// Parses program inputs from from the input file path and state file path
+/// Parses program inputs from the input file path
 pub fn parse_program_inputs(handler: &Handler, input_string: &str, start_pos: BytePos) -> Result<InputData> {
    let program_input: ProgramInput = parser::parse_input(handler, input_string, start_pos)?.try_into()?;

--- a/compiler/parser/src/tokenizer/lexer.rs
+++ b/compiler/parser/src/tokenizer/lexer.rs
@@ -175,7 +175,7 @@ impl Token {
        Ok((int.len(), Token::Integer(int)))
    }

-    /// Returns a tuple: [(token length, token)] if the next token can be eaten, otherwise returns [`None`].
+    /// Returns a tuple: [(token length, token)] if the next token can be eaten, otherwise returns an error.
    /// The next token can be eaten if the bytes at the front of the given `input` string can be scanned into a token.
    pub(crate) fn eat(input: &str) -> Result<(usize, Token)> {
        if input.is_empty() {
@@ -253,6 +253,10 @@ impl Token {
            '"' => {
                // Find end string quotation mark.
                // Instead of checking each `char` and pushing, we can avoid reallocations.
+                // This works because the code 34 of double quote cannot appear as a byte
+                // in the middle of a multi-byte UTF-8 encoding of a character,
+                // because those bytes all have the high bit set to 1;
+                // in UTF-8, the byte 34 can only appear as the single-byte encoding of double quote.
                let rest = &input_str[1..];
                let string = match rest.as_bytes().iter().position(|c| *c == b'"') {
                    None => return Err(ParserError::lexer_string_not_closed(rest).into()),
@@ -302,6 +306,10 @@ impl Token {
                input.next();
                if input.next_if_eq(&'/').is_some() {
                    // Find the end of the comment line.
+                    // This works because the code 10 of line feed cannot appear as a byte
+                    // in the middle of a multi-byte UTF-8 encoding of a character,
+                    // because those bytes all have the high bit set to 1;
+                    // in UTF-8, the byte 10 can only appear as the single-byte encoding of line feed.
                    let comment = match input_str.as_bytes().iter().position(|c| *c == b'\n') {
                        None => input_str,
                        Some(idx) => &input_str[..idx + 1],
--- a/compiler/parser/src/tokenizer/mod.rs
+++ b/compiler/parser/src/tokenizer/mod.rs
@@ -17,7 +17,7 @@
 //! The tokenizer to convert Leo code text into tokens.
 //!
 //! This module contains the [`tokenize()`] method which breaks down string text into tokens,
-//! separated by whitespace.
+//! optionally separated by whitespace.

 pub(crate) mod token;

--- a/compiler/parser/src/tokenizer/token.rs
+++ b/compiler/parser/src/tokenizer/token.rs
@@ -147,9 +147,9 @@ pub enum Token {
 }

 /// Represents all valid Leo keyword tokens.
-/// This defers from the ABNF for the following reasons:
+/// This differs from the ABNF grammar for the following reasons:
 /// Adding true and false to the keywords of the ABNF grammar makes the lexical grammar ambiguous,
-/// because true and false are also boolean literals, which are different tokens from keywords
+/// because true and false are also boolean literals, which are different tokens from keywords.
 pub const KEYWORD_TOKENS: &[Token] = &[
    Token::Address,
    Token::Assert,