Merge pull request #3635 from rtfeldman/joshuawarner32/multiline

Strip indents and the first/last newline from multiline strings
2024-11-10 10:02:38 +03:00 · 2022-07-26 22:09:58 -04:00 · 2022-07-26 22:09:58 -04:00 · fa5bd442f8
commit fa5bd442f8
parent 923341ec95 a579edc6f0
8 changed files with 284 additions and 44 deletions
--- a/crates/compiler/parse/src/parser.rs
+++ b/crates/compiler/parse/src/parser.rs
@@ -384,6 +384,7 @@ pub enum EString<'a> {
    UnknownEscape(Position),
    Format(&'a EExpr<'a>, Position),
    FormatEnd(Position),
+    MultilineInsufficientIndent(Position),
 }

 #[derive(Debug, Clone, PartialEq, Eq)]
--- a/crates/compiler/parse/src/state.rs
+++ b/crates/compiler/parse/src/state.rs
@@ -37,6 +37,25 @@ impl<'a> State<'a> {
        self.pos().offset - self.line_start.offset
    }

+    /// Mutably advance the state by a given offset
+    #[inline(always)]
+    pub(crate) fn advance_mut(&mut self, offset: usize) {
+        self.offset += offset;
+    }
+
+    /// If the next `text.len()` bytes of the input match the provided `text`,
+    /// mutably advance the state by that much.
+    #[inline(always)]
+    pub(crate) fn consume_mut(&mut self, text: &str) -> bool {
+        let found = self.bytes().starts_with(text.as_bytes());
+
+        if found {
+            self.advance_mut(text.len());
+        }
+
+        found
+    }
+
    #[must_use]
    #[inline(always)]
    pub(crate) const fn advance(mut self, offset: usize) -> State<'a> {
--- a/crates/compiler/parse/src/string_literal.rs
+++ b/crates/compiler/parse/src/string_literal.rs
@@ -1,6 +1,6 @@
 use crate::ast::{EscapedChar, StrLiteral, StrSegment};
 use crate::expr;
-use crate::parser::Progress::*;
+use crate::parser::Progress::{self, *};
 use crate::parser::{allocated, loc, specialize_ref, word1, BadInputError, EString, Parser};
 use crate::state::State;
 use bumpalo::collections::vec::Vec;
@@ -9,7 +9,7 @@ use bumpalo::Bump;
 /// One or more ASCII hex digits. (Useful when parsing unicode escape codes,
 /// which must consist entirely of ASCII hex digits.)
 fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str, EString<'a>> {
-    move |arena, state: State<'a>| {
+    move |arena, mut state: State<'a>| {
        let mut buf = bumpalo::collections::String::new_in(arena);

        for &byte in state.bytes().iter() {
@@ -19,7 +19,7 @@ fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str, EString<'a>> {
                // We didn't find any hex digits!
                return Err((NoProgress, EString::CodePtEnd(state.pos()), state));
            } else {
-                let state = state.advance(buf.len());
+                state.advance_mut(buf.len());

                return Ok((MadeProgress, buf.into_bump_str(), state));
            }
@@ -29,36 +29,27 @@ fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str, EString<'a>> {
    }
 }

-macro_rules! advance_state {
-    ($state:expr, $n:expr) => {
-        Ok($state.advance($n))
-    };
-}
-
 pub fn parse_single_quote<'a>() -> impl Parser<'a, &'a str, EString<'a>> {
    move |arena: &'a Bump, mut state: State<'a>| {
-        if state.bytes().starts_with(b"\'") {
+        if state.consume_mut("\'") {
            // we will be parsing a single-quote-string
        } else {
            return Err((NoProgress, EString::Open(state.pos()), state));
        }

-        // early return did not hit, just advance one byte
-        state = advance_state!(state, 1)?;
-
        // Handle back slaches in byte literal
        // - starts with a backslash and used as an escape character. ex: '\n', '\t'
        // - single quote floating (un closed single quote) should be an error
        match state.bytes().first() {
            Some(b'\\') => {
-                state = advance_state!(state, 1)?;
+                state.advance_mut(1);
                match state.bytes().first() {
                    Some(&ch) => {
-                        state = advance_state!(state, 1)?;
+                        state.advance_mut(1);
                        if (ch == b'n' || ch == b'r' || ch == b't' || ch == b'\'' || ch == b'\\')
                            && (state.bytes().first() == Some(&b'\''))
                        {
-                            state = advance_state!(state, 1)?;
+                            state.advance_mut(1);
                            let test = match ch {
                                b'n' => '\n',
                                b't' => '\t',
@@ -118,7 +109,7 @@ pub fn parse_single_quote<'a>() -> impl Parser<'a, &'a str, EString<'a>> {
        // ending up w/ a slice of bytes that we want to convert into an integer
        let raw_bytes = &state.bytes()[0..end_index - 1];

-        state = advance_state!(state, end_index)?;
+        state.advance_mut(end_index);
        match std::str::from_utf8(raw_bytes) {
            Ok(string) => Ok((MadeProgress, string, state)),
            Err(_) => {
@@ -129,33 +120,78 @@ pub fn parse_single_quote<'a>() -> impl Parser<'a, &'a str, EString<'a>> {
    }
 }

+fn consume_indent<'a>(
+    mut state: State<'a>,
+    mut indent: u32,
+) -> Result<State, (Progress, EString<'a>, State<'a>)> {
+    while indent > 0 {
+        match state.bytes().first() {
+            Some(b' ') => {
+                state.advance_mut(1);
+                indent -= 1;
+            }
+            None | Some(b'\n') => {
+                break;
+            }
+            Some(_) => {
+                return Err((
+                    MadeProgress,
+                    EString::MultilineInsufficientIndent(state.pos()),
+                    state,
+                ));
+            }
+        }
+    }
+
+    Ok(state)
+}
+
+fn utf8<'a>(
+    state: State<'a>,
+    string_bytes: &'a [u8],
+) -> Result<&'a str, (Progress, EString<'a>, State<'a>)> {
+    std::str::from_utf8(string_bytes).map_err(|_| {
+        // Note: based on where this `utf8` function is used, and the fact that we know the whole
+        // string in the parser is valid utf8, this error _should_ actually be unreachable, barring
+        // bugs in the parser itself (e.g. where we accidentally split a multibyte utf8 char).
+        (
+            MadeProgress,
+            EString::Space(BadInputError::BadUtf8, state.pos()),
+            state,
+        )
+    })
+}
+
 pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
    use StrLiteral::*;

    move |arena: &'a Bump, mut state: State<'a>| {
        let is_multiline;
-        let mut bytes;

-        if state.bytes().starts_with(b"\"\"\"") {
-            // we will be parsing a multi-string
+        let indent = state.column();
+
+        let start_state;
+
+        if state.consume_mut("\"\"\"") {
+            start_state = state.clone();
+
+            // we will be parsing a multi-line string
            is_multiline = true;
-            bytes = state.bytes()[3..].iter();
-            state = advance_state!(state, 3)?;
-        } else if state.bytes().starts_with(b"\"") {
-            // we will be parsing a single-string
+
+            if state.consume_mut("\n") {
+                state = consume_indent(state, indent)?;
+            }
+        } else if state.consume_mut("\"") {
+            start_state = state.clone();
+
+            // we will be parsing a single-line string
            is_multiline = false;
-            bytes = state.bytes()[1..].iter();
-            state = advance_state!(state, 1)?;
        } else {
            return Err((NoProgress, EString::Open(state.pos()), state));
        }

-        // At the parsing stage we keep the entire raw string, because the formatter
-        // needs the raw string. (For example, so it can "remember" whether you
-        // wrote \u{...} or the actual unicode character itself.)
-        //
-        // Since we're keeping the entire raw string, all we need to track is
-        // how many characters we've parsed. So far, that's 1 (the opening `"`).
+        let mut bytes = state.bytes().iter();
+
        let mut segment_parsed_bytes = 0;
        let mut segments = Vec::new_in(arena);

@@ -165,7 +201,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
                segments.push(StrSegment::EscapedChar($ch));

                // Advance past the segment we just added
-                state = advance_state!(state, segment_parsed_bytes)?;
+                state.advance_mut(segment_parsed_bytes);

                // Reset the segment
                segment_parsed_bytes = 0;
@@ -184,7 +220,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {

                    match std::str::from_utf8(string_bytes) {
                        Ok(string) => {
-                            state = advance_state!(state, string.len())?;
+                            state.advance_mut(string.len());

                            segments.push($transform(string));
                        }
@@ -220,7 +256,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
                        // special case of the empty string
                        if is_multiline {
                            if bytes.as_slice().starts_with(b"\"\"") {
-                                return Ok((MadeProgress, Block(&[]), advance_state!(state, 3)?));
+                                return Ok((MadeProgress, Block(&[]), state.advance(3)));
                            } else {
                                // this quote is in a block string
                                continue;
@@ -228,7 +264,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
                        } else {
                            // This is the end of the string!
                            // Advance 1 for the close quote
-                            return Ok((MadeProgress, PlainLine(""), advance_state!(state, 1)?));
+                            return Ok((MadeProgress, PlainLine(""), state.advance(1)));
                        }
                    } else {
                        // the string is non-empty, which means we need to convert any previous segments
@@ -250,7 +286,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
                                    Block(arena.alloc([segments.into_bump_slice()]))
                                };

-                                return Ok((MadeProgress, expr, advance_state!(state, 3)?));
+                                return Ok((MadeProgress, expr, state.advance(3)));
                            } else {
                                // this quote is in a block string
                                continue;
@@ -270,12 +306,30 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
                            };

                            // Advance the state 1 to account for the closing `"`
-                            return Ok((MadeProgress, expr, advance_state!(state, 1)?));
+                            return Ok((MadeProgress, expr, state.advance(1)));
                        }
                    };
                }
                b'\n' => {
                    if is_multiline {
+                        let without_newline = &state.bytes()[0..(segment_parsed_bytes - 1)];
+                        let with_newline = &state.bytes()[0..segment_parsed_bytes];
+
+                        state.advance_mut(segment_parsed_bytes);
+                        state = consume_indent(state, indent)?;
+                        bytes = state.bytes().iter();
+
+                        if state.bytes().starts_with(b"\"\"\"") {
+                            // ending the string; don't use the last newline
+                            segments
+                                .push(StrSegment::Plaintext(utf8(state.clone(), without_newline)?));
+                        } else {
+                            segments
+                                .push(StrSegment::Plaintext(utf8(state.clone(), with_newline)?));
+                        }
+
+                        segment_parsed_bytes = 0;
+
                        continue;
                    } else {
                        // This is a single-line string, which cannot have newlines!
@@ -283,7 +337,11 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
                        // all remaining chars. This will mask all other errors, but
                        // it should make it easiest to debug; the file will be a giant
                        // error starting from where the open quote appeared.
-                        return Err((MadeProgress, EString::EndlessSingle(state.pos()), state));
+                        return Err((
+                            MadeProgress,
+                            EString::EndlessSingle(start_state.pos()),
+                            start_state,
+                        ));
                    }
                }
                b'\\' => {
@@ -301,7 +359,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
                    match bytes.next() {
                        Some(b'(') => {
                            // Advance past the `\(` before using the expr parser
-                            state = advance_state!(state, 2)?;
+                            state.advance_mut(2);

                            let original_byte_count = state.bytes().len();

@@ -328,7 +386,7 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
                        }
                        Some(b'u') => {
                            // Advance past the `\u` before using the expr parser
-                            state = advance_state!(state, 2)?;
+                            state.advance_mut(2);

                            let original_byte_count = state.bytes().len();

@@ -386,11 +444,11 @@ pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
        Err((
            MadeProgress,
            if is_multiline {
-                EString::EndlessMulti(state.pos())
+                EString::EndlessMulti(start_state.pos())
            } else {
-                EString::EndlessSingle(state.pos())
+                EString::EndlessSingle(start_state.pos())
            },
-            state,
+            start_state,
        ))
    }
 }
--- a/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.result-ast
+++ b/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.result-ast
@@ -0,0 +1,114 @@
+Defs(
+    Defs {
+        tags: [
+            Index(2147483648),
+            Index(2147483649),
+            Index(2147483650),
+        ],
+        regions: [
+            @0-22,
+            @23-49,
+            @50-92,
+        ],
+        space_before: [
+            Slice(start = 0, length = 0),
+            Slice(start = 0, length = 1),
+            Slice(start = 1, length = 1),
+        ],
+        space_after: [
+            Slice(start = 0, length = 0),
+            Slice(start = 1, length = 0),
+            Slice(start = 2, length = 0),
+        ],
+        spaces: [
+            Newline,
+            Newline,
+        ],
+        type_defs: [],
+        value_defs: [
+            Body(
+                @0-1 Identifier(
+                    "a",
+                ),
+                @4-22 Str(
+                    Line(
+                        [
+                            Plaintext(
+                                "Hello,",
+                            ),
+                            EscapedChar(
+                                Newline,
+                            ),
+                            EscapedChar(
+                                Newline,
+                            ),
+                            Plaintext(
+                                "World!",
+                            ),
+                        ],
+                    ),
+                ),
+            ),
+            Body(
+                @23-24 Identifier(
+                    "b",
+                ),
+                @27-49 Str(
+                    Block(
+                        [
+                            [
+                                Plaintext(
+                                    "Hello,",
+                                ),
+                                EscapedChar(
+                                    Newline,
+                                ),
+                                EscapedChar(
+                                    Newline,
+                                ),
+                                Plaintext(
+                                    "World!",
+                                ),
+                            ],
+                        ],
+                    ),
+                ),
+            ),
+            Body(
+                @50-51 Identifier(
+                    "c",
+                ),
+                @58-92 SpaceBefore(
+                    Str(
+                        Block(
+                            [
+                                [
+                                    Plaintext(
+                                        "Hello,\n",
+                                    ),
+                                    Plaintext(
+                                        "\n",
+                                    ),
+                                    Plaintext(
+                                        "World!",
+                                    ),
+                                ],
+                            ],
+                        ),
+                    ),
+                    [
+                        Newline,
+                    ],
+                ),
+            ),
+        ],
+    },
+    @93-95 SpaceBefore(
+        Num(
+            "42",
+        ),
+        [
+            Newline,
+        ],
+    ),
+)
--- a/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.roc
+++ b/crates/compiler/parse/tests/snapshots/pass/multiline_string.expr.roc
@@ -0,0 +1,9 @@
+a = "Hello,\n\nWorld!"
+b = """Hello,\n\nWorld!"""
+c =
+    """
+    Hello,
+
+    World!
+    """
+42
--- a/crates/compiler/parse/tests/test_parse.rs
+++ b/crates/compiler/parse/tests/test_parse.rs
@@ -204,6 +204,7 @@ mod test_parse {
        pass/not_docs.expr,
        pass/number_literal_suffixes.expr,
        pass/one_backpassing.expr,
+        pass/multiline_string.expr,
        pass/one_char_string.expr,
        pass/one_def.expr,
        pass/one_minus_two.expr,
--- a/crates/reporting/src/error/parse.rs
+++ b/crates/reporting/src/error/parse.rs
@@ -921,6 +921,27 @@ fn to_str_report<'a>(
                severity: Severity::RuntimeError,
            }
        }
+        EString::MultilineInsufficientIndent(pos) => {
+            let surroundings = Region::new(start, pos);
+            let region = LineColumnRegion::from_pos(lines.convert_pos(pos));
+
+            let doc = alloc.stack([
+                alloc.reflow(r"This multiline string is not sufficiently indented:"),
+                alloc.region_with_subregion(lines.convert_region(surroundings), region),
+                alloc.concat([
+                    alloc.reflow(r"Lines in a multi-line string must be indented at least as "),
+                    alloc.reflow("much as the beginning \"\"\". This extra indentation is automatically removed "),
+                    alloc.reflow("from the string during compilation."),
+                ]),
+            ]);
+
+            Report {
+                filename,
+                doc,
+                title: "INSUFFICIENT INDENT IN MULTI-LINE STRING".to_string(),
+                severity: Severity::RuntimeError,
+            }
+        }
    }
 }
 fn to_expr_in_parens_report<'a>(
--- a/crates/reporting/tests/test_reporting.rs
+++ b/crates/reporting/tests/test_reporting.rs
@@ -5228,6 +5228,23 @@ mod test_reporting {
    "###
    );

+    test_report!(
+        multi_insufficient_indent,
+        "    \"\"\"\n  testing\n    \"\"\"", // 4 space indent on the start, 2 space on the `testing` line
+        @r###"
+    ── INSUFFICIENT INDENT IN MULTI-LINE STRING ─ ..._insufficient_indent/Test.roc ─
+
+    This multiline string is not sufficiently indented:
+
+    5│        testing
+              ^
+
+    Lines in a multi-line string must be indented at least as much as the
+    beginning """. This extra indentation is automatically removed from
+    the string during compilation.
+    "###
+    );
+
    // https://github.com/rtfeldman/roc/issues/1714
    test_report!(
    interpolate_concat_is_transparent_1714,