Canonicalize unicode code point escapes

2024-09-20 15:27:45 +03:00 · 2020-08-30 22:21:19 -04:00 · 2020-08-30 22:21:19 -04:00 · 2e15443c8c
commit 2e15443c8c
parent 7682e09b0a
4 changed files with 125 additions and 61 deletions
--- a/compiler/can/src/expr.rs
+++ b/compiler/can/src/expr.rs
@@ -22,7 +22,7 @@ use roc_region::all::{Located, Region};
 use roc_types::subs::{VarStore, Variable};
 use roc_types::types::Alias;
 use std::fmt::Debug;
-use std::i64;
+use std::{char, i64, u32};

 #[derive(Clone, Default, Debug, PartialEq)]
 pub struct Output {
@@ -1359,7 +1359,7 @@ fn flatten_str_lines<'a>(
    use ast::StrSegment::*;

    let mut buf = String::new();
-    let mut interpolations = Vec::new();
+    let mut segments = Vec::new();
    let mut output = Output::default();

    for line in lines {
@@ -1368,11 +1368,41 @@ fn flatten_str_lines<'a>(
                Plaintext(string) => {
                    buf.push_str(string);
                }
-                Unicode(loc_digits) => {
-                    todo!("parse unicode digits {:?}", loc_digits);
-                }
+                Unicode(loc_hex_digits) => match u32::from_str_radix(loc_hex_digits.value, 16) {
+                    Ok(code_pt) => match char::from_u32(code_pt) {
+                        Some(ch) => {
+                            buf.push(ch);
+                        }
+                        None => {
+                            env.problem(Problem::InvalidUnicodeCodePoint(loc_hex_digits.region));
+
+                            return (
+                                Expr::RuntimeError(RuntimeError::InvalidUnicodeCodePoint(
+                                    loc_hex_digits.region,
+                                )),
+                                output,
+                            );
+                        }
+                    },
+                    Err(_) => {
+                        env.problem(Problem::InvalidHexadecimal(loc_hex_digits.region));
+
+                        return (
+                            Expr::RuntimeError(RuntimeError::InvalidHexadecimal(
+                                loc_hex_digits.region,
+                            )),
+                            output,
+                        );
+                    }
+                },
                Interpolated(loc_expr) => {
                    if is_valid_interpolation(loc_expr.value) {
+                        if !buf.is_empty() {
+                            segments.push(StrSegment::Plaintext(buf.into()));
+
+                            buf = String::new();
+                        }
+
                        let (loc_expr, new_output) = canonicalize_expr(
                            env,
                            var_store,
@@ -1383,7 +1413,7 @@ fn flatten_str_lines<'a>(

                        output.union(new_output);

-                        interpolations.push(StrSegment::Interpolation(loc_expr));
+                        segments.push(StrSegment::Interpolation(loc_expr));
                    } else {
                        env.problem(Problem::InvalidInterpolation(loc_expr.region));

@@ -1398,7 +1428,11 @@ fn flatten_str_lines<'a>(
        }
    }

-    (Expr::Str(interpolations), output)
+    if !buf.is_empty() {
+        segments.push(StrSegment::Plaintext(buf.into()));
+    }
+
+    (Expr::Str(segments), output)
 }

 /// Returns the char that would have been originally parsed to
--- a/compiler/can/tests/test_can.rs
+++ b/compiler/can/tests/test_can.rs
@@ -15,7 +15,7 @@ mod test_can {
    use crate::helpers::{can_expr_with, test_home, CanExprOut};
    use bumpalo::Bump;
    use roc_can::expr::Expr::{self, *};
-    use roc_can::expr::Recursive;
+    use roc_can::expr::{Recursive, StrSegment};
    use roc_problem::can::{FloatErrorKind, IntErrorKind, Problem, RuntimeError};
    use roc_region::all::Region;
    use std::{f64, i64};
@@ -69,6 +69,10 @@ mod test_can {
        }
    }

+    fn expr_str(contents: &str) -> Expr {
+        Expr::Str(vec![StrSegment::Plaintext(contents.into())])
+    }
+
    // NUMBER LITERALS

    #[test]
@@ -1179,62 +1183,61 @@ mod test_can {
    //}
    //
    //
-    //// STRING LITERALS
+    // STRING LITERALS

-    //
-    // #[test]
-    // fn string_with_valid_unicode_escapes() {
-    //     expect_parsed_str("x\u{00A0}x", r#""x\u{00A0}x""#);
-    //     expect_parsed_str("x\u{101010}x", r#""x\u{101010}x""#);
-    // }
+    #[test]
+    fn string_with_valid_unicode_escapes() {
+        assert_can(r#""x\u(00A0)x""#, expr_str("x\u{00A0}x"));
+        assert_can(r#""x\u(101010)x""#, expr_str("x\u{101010}x"));
+    }

-    // #[test]
-    // fn string_with_too_large_unicode_escape() {
-    //     // Should be too big - max size should be 10FFFF.
-    //     // (Rust has this restriction. I assume it's a good idea.)
-    //     assert_malformed_str(
-    //         r#""abc\u{110000}def""#,
-    //         vec![Located::new(0, 7, 0, 12, Problem::UnicodeCodePointTooLarge)],
-    //     );
-    // }
+    //     #[test]
+    //     fn string_with_too_large_unicode_escape() {
+    //         // Should be too big - max size should be 10FFFF.
+    //         // (Rust has this restriction. I assume it's a good idea.)
+    //         assert_malformed_str(
+    //             r#""abc\u{110000}def""#,
+    //             vec![Located::new(0, 7, 0, 12, Problem::UnicodeCodePointTooLarge)],
+    //         );
+    //     }

-    // #[test]
-    // fn string_with_no_unicode_digits() {
-    //     // No digits specified
-    //     assert_malformed_str(
-    //         r#""blah\u{}foo""#,
-    //         vec![Located::new(0, 5, 0, 8, Problem::NoUnicodeDigits)],
-    //     );
-    // }
+    //     #[test]
+    //     fn string_with_no_unicode_digits() {
+    //         // No digits specified
+    //         assert_malformed_str(
+    //             r#""blah\u{}foo""#,
+    //             vec![Located::new(0, 5, 0, 8, Problem::NoUnicodeDigits)],
+    //         );
+    //     }

-    // #[test]
-    // fn string_with_no_unicode_opening_brace() {
-    //     // No opening curly brace. It can't be sure if the closing brace
-    //     // was intended to be a closing brace for the unicode escape, so
-    //     // report that there were no digits specified.
-    //     assert_malformed_str(
-    //         r#""abc\u00A0}def""#,
-    //         vec![Located::new(0, 4, 0, 5, Problem::NoUnicodeDigits)],
-    //     );
-    // }
+    //     #[test]
+    //     fn string_with_no_unicode_opening_brace() {
+    //         // No opening curly brace. It can't be sure if the closing brace
+    //         // was intended to be a closing brace for the unicode escape, so
+    //         // report that there were no digits specified.
+    //         assert_malformed_str(
+    //             r#""abc\u00A0}def""#,
+    //             vec![Located::new(0, 4, 0, 5, Problem::NoUnicodeDigits)],
+    //         );
+    //     }

-    // #[test]
-    // fn string_with_no_unicode_closing_brace() {
-    //     // No closing curly brace
-    //     assert_malformed_str(
-    //         r#""blah\u{stuff""#,
-    //         vec![Located::new(0, 5, 0, 12, Problem::MalformedEscapedUnicode)],
-    //     );
-    // }
+    //     #[test]
+    //     fn string_with_no_unicode_closing_brace() {
+    //         // No closing curly brace
+    //         assert_malformed_str(
+    //             r#""blah\u{stuff""#,
+    //             vec![Located::new(0, 5, 0, 12, Problem::MalformedEscapedUnicode)],
+    //         );
+    //     }

-    // #[test]
-    // fn string_with_no_unicode_braces() {
-    //     // No curly braces
-    //     assert_malformed_str(
-    //         r#""zzzz\uzzzzz""#,
-    //         vec![Located::new(0, 5, 0, 6, Problem::NoUnicodeDigits)],
-    //     );
-    // }
+    //     #[test]
+    //     fn string_with_no_unicode_braces() {
+    //         // No curly braces
+    //         assert_malformed_str(
+    //             r#""zzzz\uzzzzz""#,
+    //             vec![Located::new(0, 5, 0, 6, Problem::NoUnicodeDigits)],
+    //         );
+    //     }

    //     #[test]
    //     fn string_with_escaped_interpolation() {
@@ -1242,13 +1245,12 @@ mod test_can {
    //             // This should NOT be string interpolation, because of the \\
    //             indoc!(
    //                 r#"
-    //                 "abcd\\(efg)hij"
-    //                 "#
+    //                      "abcd\\(efg)hij"
+    //                      "#
    //             ),
    //             Str(r#"abcd\(efg)hij"#.into()),
    //         );
    //     }
-    //

    //     #[test]
    //     fn string_without_escape() {
--- a/compiler/problem/src/can.rs
+++ b/compiler/problem/src/can.rs
@@ -56,6 +56,8 @@ pub enum Problem {
        region: Region,
    },
    InvalidInterpolation(Region),
+    InvalidHexadecimal(Region),
+    InvalidUnicodeCodePoint(Region),
 }

 #[derive(Clone, Debug, PartialEq)]
@@ -127,6 +129,8 @@ pub enum RuntimeError {
    NonExhaustivePattern,

    InvalidInterpolation(Region),
+    InvalidHexadecimal(Region),
+    InvalidUnicodeCodePoint(Region),

    /// When the author specifies a type annotation but no implementation
    NoImplementation,
--- a/compiler/reporting/src/error/canonicalize.rs
+++ b/compiler/reporting/src/error/canonicalize.rs
@@ -262,6 +262,18 @@ pub fn can_problem<'b>(
                alloc.reflow(" can occur in this position."),
            ]),
        ]),
+        Problem::InvalidHexadecimal(region) => {
+            todo!(
+                "TODO report an invalid hexadecimal number in a \\u(...) code point at region {:?}",
+                region
+            );
+        }
+        Problem::InvalidUnicodeCodePoint(region) => {
+            todo!(
+                "TODO report an invalid \\u(...) code point at region {:?}",
+                region
+            );
+        }
        Problem::InvalidInterpolation(region) => {
            todo!(
                "TODO report an invalid string interpolation at region {:?}",
@@ -530,6 +542,18 @@ fn pretty_runtime_error<'b>(
            alloc.region(region),
            alloc.reflow("Only variables can be updated with record update syntax."),
        ]),
+        RuntimeError::InvalidHexadecimal(region) => {
+            todo!(
+                "TODO runtime error for an invalid hexadecimal number in a \\u(...) code point at region {:?}",
+                region
+            );
+        }
+        RuntimeError::InvalidUnicodeCodePoint(region) => {
+            todo!(
+                "TODO runtime error for an invalid \\u(...) code point at region {:?}",
+                region
+            );
+        }
        RuntimeError::InvalidInterpolation(region) => {
            todo!(
                "TODO runtime error for an invalid string interpolation at region {:?}",