1
1
mirror of https://github.com/tweag/nickel.git synced 2024-09-19 23:47:54 +03:00

Fix interpolation in multiline strings

Fix issue #596. The main problem was a `token` annotation instead of a
`regex` in the lexer. Since this bug was preventing the corresponding
code path in the lexer from being triggered, this patch also fixes a few
other small issues that appeared along the way. Bump the `logos`
dependency and add a regression test as well.
This commit is contained in:
Yann Hamdaoui 2022-02-04 22:23:31 +01:00
parent ac9b9a1545
commit 98bc241ea6
5 changed files with 38 additions and 26 deletions

12
Cargo.lock generated
View File

@ -128,9 +128,9 @@ dependencies = [
[[package]]
name = "beef"
version = "0.4.4"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "474a626a67200bd107d44179bb3d4fc61891172d11696609264589be6a0e6a43"
checksum = "bed554bd50246729a1ec158d08aa3235d1b69d94ad120ebe187e28894787e736"
[[package]]
name = "bit-set"
@ -788,18 +788,18 @@ dependencies = [
[[package]]
name = "logos"
version = "0.11.4"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b91c49573597a5d6c094f9031617bb1fed15c0db68c81e6546d313414ce107e4"
checksum = "427e2abca5be13136da9afdbf874e6b34ad9001dd70f2b103b083a85daa7b345"
dependencies = [
"logos-derive",
]
[[package]]
name = "logos-derive"
version = "0.11.5"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "797b1f8a0571b331c1b47e7db245af3dc634838da7a92b3bef4e30376ae1c347"
checksum = "56a7d287fd2ac3f75b11f19a1c8a874a7d55744bd91f7a1b3e7cf87d4343c36d"
dependencies = [
"beef",
"fnv",

View File

@ -30,7 +30,7 @@ regex = "0.2.1"
simple-counter = "0.1.0"
codespan = "0.11"
codespan-reporting = "0.11"
logos = "0.11.4"
logos = "0.12.0"
serde = { version = "1.0.117", features = ["derive"] }
serde_json = "1.0.59"
serde_yaml = "0.8.15"

View File

@ -333,7 +333,7 @@ pub enum MultiStringToken<'input> {
/// The other rules should be sufficient to match this as a double quote followed by a
/// `CandidateInterpolation`, but if we omit this token, the lexer can fail unexpectedly on
/// valid inputs because of #200.
#[token("\"#+\\{")]
#[regex("\"#+\\{")]
QuotesCandidateInterpolation(&'input str),
/// Token emitted by the modal lexer for the parser once it has decided that a `CandidateEnd` is
/// an actual end token.
@ -437,7 +437,7 @@ impl<'input> Lexer<'input> {
// `Normal`
Some(ModalLexer::Normal(lexer)) => {
self.stack.push(ModeElt::Normal(self.count));
self.lexer.replace(morph(lexer));
self.lexer = Some(morph(lexer));
}
_ => panic!("lexer::enter_strlike"),
}
@ -457,11 +457,11 @@ impl<'input> Lexer<'input> {
match self.lexer.take() {
//count must be zero, and we do not push it on the stack
Some(ModalLexer::Str(lexer)) => {
self.lexer.replace(ModalLexer::Normal(lexer.morph()));
self.lexer = Some(ModalLexer::Normal(lexer.morph()));
self.stack.push(ModeElt::Str);
}
Some(ModalLexer::MultiStr(lexer)) => {
self.lexer.replace(ModalLexer::Normal(lexer.morph()));
self.lexer = Some(ModalLexer::Normal(lexer.morph()));
self.stack.push(ModeElt::MultiStr(self.count));
}
_ => panic!("lexer::enter_normal"),
@ -479,7 +479,7 @@ impl<'input> Lexer<'input> {
mode => panic!("lexer::leave_str (popped mode {:?})", mode),
};
self.lexer.replace(ModalLexer::Normal(lexer.morph()));
self.lexer = Some(ModalLexer::Normal(lexer.morph()));
}
_ => panic!("lexer::leave_str"),
}
@ -494,7 +494,7 @@ impl<'input> Lexer<'input> {
mode => panic!("lexer::leave_str (popped mode {:?})", mode),
};
self.lexer.replace(ModalLexer::Normal(lexer.morph()));
self.lexer = Some(ModalLexer::Normal(lexer.morph()));
}
_ => panic!("lexer::leave_str"),
}
@ -505,10 +505,10 @@ impl<'input> Lexer<'input> {
Some(ModalLexer::Normal(lexer)) => {
// count must be 0
match self.stack.pop() {
Some(ModeElt::Str) => self.lexer.replace(ModalLexer::Str(lexer.morph())),
Some(ModeElt::Str) => self.lexer = Some(ModalLexer::Str(lexer.morph())),
Some(ModeElt::MultiStr(count)) => {
self.count = count;
self.lexer.replace(ModalLexer::MultiStr(lexer.morph()))
self.lexer = Some(ModalLexer::MultiStr(lexer.morph()))
}
mode => panic!("lexer::leave_normal (popped mode {:?})", mode),
};
@ -566,28 +566,36 @@ impl<'input> Iterator for Lexer<'input> {
token = Some(MultiStr(MultiStringToken::Interpolation));
self.enter_normal();
}
// We never lex something as a `MultiStringToken::Interpolation` directly, but rather
// generate it in this very function from other tokens. However, such a token could
// have still been buffered in the previous iteration, and can thus be matched here,
// which is why we need the case below.
Some(MultiStr(MultiStringToken::Interpolation)) => self.enter_normal(),
// If we encounter a `QuotesCandidateInterpolation` token with the right number of
// characters, we need to split it into two tokens:
// - a simple `"` literal
// - a interpolation token
// - a literal starting with a `"` followed by between 0 and k hashes `#`
// - an interpolation token
// The interpolation token is put in the buffer such that it will be returned next
// time.
//
// For example, in `m##""###{exp}"##m`, the `"###{` is a `QuotesCandidateInterpolation`
// which is split as a `"#` literal followed by an interpolation token.
Some(MultiStr(MultiStringToken::QuotesCandidateInterpolation(s)))
if s.len() == self.count =>
if s.len() >= self.count =>
{
let split_at = s.len() - self.count + 1;
let next_token = MultiStr(MultiStringToken::Interpolation);
let next_span = Range {
start: span.start + 1,
start: span.start + split_at,
end: span.end,
};
self.buffer.replace((next_token, next_span));
self.buffer = Some((next_token, next_span));
token = Some(MultiStr(MultiStringToken::Literal(&s[0..1])));
token = Some(MultiStr(MultiStringToken::Literal(&s[0..split_at])));
span = Range {
start: span.start,
end: span.start + 1,
end: span.start + split_at,
};
self.enter_normal();
}
// Otherwise, it is just part of the string, so we transform the token into a
// `FalseInterpolation` one
@ -637,7 +645,7 @@ impl<'input> Iterator for Lexer<'input> {
// Ignore comment
Some(Normal(NormalToken::LineComment)) => return self.next(),
_ => (),
}
};
token.map(|t| Ok((span.start, t, span.end)))
}

View File

@ -279,8 +279,7 @@ fn string_lexing() {
lex_without_pos(r##"m#""#"#m"##),
Ok(vec![
Token::Normal(NormalToken::MultiStringStart(3)),
Token::MultiStr(MultiStringToken::Literal("\"")),
Token::MultiStr(MultiStringToken::Literal("#")),
Token::MultiStr(MultiStringToken::Literal("\"#")),
Token::MultiStr(MultiStringToken::End),
])
);

View File

@ -15,5 +15,10 @@ let Assert = fun l x => x || %blame% l in
m#""#{"foo"}""#m == "\"foo\"",
m#"""#m == "\"",
m#""#"#"#"#m == "\"#\"#\"#",
// regression test for issue #596 (https://github.com/tweag/nickel/issues/596)
let s = "Hello" in m##""##{s}" World"##m == "\"Hello\" World",
let s = "Hello" in m##""###{s}" World"##m == "\"#Hello\" World",
m##""##s"##m == "\"##s",
]
|> lists.foldl (fun x y => (x | #Assert) && y) true