Merge pull request #432 from rtfeldman/utf8

Lazily validate UTF-8 when parsing
2024-09-22 00:09:33 +03:00 · 2020-07-27 01:59:44 -04:00 · 2020-07-27 01:59:44 -04:00 · 8b3dd6c90c
commit 8b3dd6c90c
parent baa0a5030e 99e41c73f3
22 changed files with 1024 additions and 833 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -527,6 +527,12 @@ version = "1.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"

+[[package]]
+name = "encode_unicode"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
+
 [[package]]
 name = "env_logger"
 version = "0.6.2"
@ -2229,6 +2235,7 @@ name = "roc_parse"
 version = "0.1.0"
 dependencies = [
 "bumpalo",
+ "encode_unicode",
 "indoc",
 "inlinable_string",
 "pretty_assertions",
--- a/cli/src/repl.rs
+++ b/cli/src/repl.rs
@ -33,6 +33,7 @@ use roc_types::types::Type;
 use std::hash::Hash;
 use std::io::{self, Write};
 use std::path::PathBuf;
+use std::str::from_utf8_unchecked;
 use target_lexicon::Triple;

 pub fn main() -> io::Result<()> {
@ -145,7 +146,7 @@ fn report_parse_error(fail: Fail) {
 }

 fn print_output(src: &str) -> Result<String, Fail> {
-    gen(src, Triple::host(), OptLevel::Normal).map(|(answer, answer_type)| {
+    gen(src.as_bytes(), Triple::host(), OptLevel::Normal).map(|(answer, answer_type)| {
        format!("\n{} \u{001b}[35m:\u{001b}[0m {}", answer, answer_type)
    })
 }
@ -154,7 +155,7 @@ pub fn repl_home() -> ModuleId {
    ModuleIds::default().get_or_insert(&"REPL".into())
 }

-pub fn gen(src: &str, target: Triple, opt_level: OptLevel) -> Result<(String, String), Fail> {
+pub fn gen(src: &[u8], target: Triple, opt_level: OptLevel) -> Result<(String, String), Fail> {
    use roc_reporting::report::{can_problem, type_problem, RocDocAllocator, DEFAULT_PALETTE};

    // Look up the types and expressions of the `provided` values
@ -169,13 +170,16 @@ pub fn gen(src: &str, target: Triple, opt_level: OptLevel) -> Result<(String, St
        interns,
        problems: can_problems,
        ..
-    } = can_expr(src)?;
+    } = can_expr(src)?; // IMPORTANT: we must bail out here if there were UTF-8 errors!
+
    let subs = Subs::new(var_store.into());
    let mut type_problems = Vec::new();
    let (content, mut subs) = infer_expr(subs, &mut type_problems, &constraint, var);

+    // SAFETY: we've already verified that this is valid UTF-8 during parsing.
+    let src_lines: Vec<&str> = unsafe { from_utf8_unchecked(src).split('\n').collect() };
+
    // Report problems
-    let src_lines: Vec<&str> = src.split('\n').collect();
    let palette = DEFAULT_PALETTE;

    // Report parsing and canonicalization problems
@ -386,8 +390,11 @@ pub fn infer_expr(
    (content, solved.into_inner())
 }

-pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+pub fn parse_loc_with<'a>(
+    arena: &'a Bump,
+    bytes: &'a [u8],
+) -> Result<Located<ast::Expr<'a>>, Fail> {
+    let state = State::new(&bytes, Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);

@ -396,14 +403,14 @@ pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast
        .map_err(|(fail, _)| fail)
 }

-pub fn can_expr(expr_str: &str) -> Result<CanExprOut, Fail> {
-    can_expr_with(&Bump::new(), repl_home(), expr_str)
+pub fn can_expr(expr_bytes: &[u8]) -> Result<CanExprOut, Fail> {
+    can_expr_with(&Bump::new(), repl_home(), expr_bytes)
 }

 // TODO make this return a named struct instead of a big tuple
 #[allow(clippy::type_complexity)]
 pub fn uniq_expr(
-    expr_str: &str,
+    expr_bytes: &[u8],
 ) -> Result<
    (
        Located<roc_can::expr::Expr>,
@ -419,14 +426,14 @@ pub fn uniq_expr(
 > {
    let declared_idents: &ImMap<Ident, (Symbol, Region)> = &ImMap::default();

-    uniq_expr_with(&Bump::new(), expr_str, declared_idents)
+    uniq_expr_with(&Bump::new(), expr_bytes, declared_idents)
 }

 // TODO make this return a named struct instead of a big tuple
 #[allow(clippy::type_complexity)]
 pub fn uniq_expr_with(
    arena: &Bump,
-    expr_str: &str,
+    expr_bytes: &[u8],
    declared_idents: &ImMap<Ident, (Symbol, Region)>,
 ) -> Result<
    (
@ -450,7 +457,7 @@ pub fn uniq_expr_with(
        var,
        interns,
        ..
-    } = can_expr_with(arena, home, expr_str)?;
+    } = can_expr_with(arena, home, expr_bytes)?;

    // double check
    let mut var_store = VarStore::new(old_var_store.fresh());
@ -505,8 +512,8 @@ pub struct CanExprOut {
    pub constraint: Constraint,
 }

-pub fn can_expr_with(arena: &Bump, home: ModuleId, expr_str: &str) -> Result<CanExprOut, Fail> {
-    let loc_expr = parse_loc_with(&arena, expr_str)?;
+pub fn can_expr_with(arena: &Bump, home: ModuleId, expr_bytes: &[u8]) -> Result<CanExprOut, Fail> {
+    let loc_expr = parse_loc_with(&arena, expr_bytes)?;
    let mut var_store = VarStore::default();
    let var = var_store.fresh();
    let expected = Expected::NoExpectation(Type::Variable(var));
--- a/compiler/can/tests/helpers/mod.rs
+++ b/compiler/can/tests/helpers/mod.rs
@ -27,7 +27,7 @@ pub fn parse_with<'a>(arena: &'a Bump, input: &'a str) -> Result<ast::Expr<'a>,

 #[allow(dead_code)]
 pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+    let state = State::new(input.as_bytes(), Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);

--- a/compiler/fmt/tests/test_fmt.rs
+++ b/compiler/fmt/tests/test_fmt.rs
@ -20,7 +20,7 @@ mod test_fmt {
    use roc_parse::parser::{Fail, Parser, State};

    fn parse_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Expr<'a>, Fail> {
-        let state = State::new(&input, Attempting::Module);
+        let state = State::new(input.as_bytes(), Attempting::Module);
        let parser = space0_before(loc!(roc_parse::expr::expr(0)), 0);
        let answer = parser.parse(&arena, state);

@ -55,7 +55,7 @@ mod test_fmt {
        let src = src.trim_end();
        let expected = expected.trim_end();

-        match module::header().parse(&arena, State::new(&src, Attempting::Module)) {
+        match module::header().parse(&arena, State::new(src.as_bytes(), Attempting::Module)) {
            Ok((actual, state)) => {
                let mut buf = String::new_in(&arena);

--- a/compiler/gen/tests/helpers/mod.rs
+++ b/compiler/gen/tests/helpers/mod.rs
@ -87,7 +87,7 @@ pub fn infer_expr(
 }

 pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+    let state = State::new(input.as_bytes(), Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);

--- a/compiler/load/src/file.rs
+++ b/compiler/load/src/file.rs
@ -19,9 +19,10 @@ use roc_solve::solve;
 use roc_types::solved_types::Solved;
 use roc_types::subs::{Subs, VarStore, Variable};
 use std::collections::{HashMap, HashSet};
-use std::fs::read_to_string;
+use std::fs;
 use std::io;
 use std::path::{Path, PathBuf};
+use std::str::from_utf8_unchecked;
 use std::sync::{Arc, Mutex};
 use tokio::sync::mpsc;
 use tokio::task::spawn_blocking;
@ -63,7 +64,7 @@ struct ModuleHeader {
    imported_modules: MutSet<ModuleId>,
    exposes: Vec<Symbol>,
    exposed_imports: MutMap<Ident, (Symbol, Region)>,
-    src: Box<str>,
+    src: Box<[u8]>,
 }

 #[derive(Debug)]
@ -526,58 +527,70 @@ fn load_module(
    load_filename(filename, msg_tx, module_ids)
 }

+fn parse_src(
+    filename: PathBuf,
+    msg_tx: MsgSender,
+    module_ids: SharedModules<'_, '_>,
+    src_bytes: &[u8],
+) -> Result<ModuleId, LoadingProblem> {
+    let state = State::new(src_bytes, Attempting::Module);
+    let arena = Bump::new();
+
+    // TODO figure out if there's a way to address this clippy error
+    // without introducing a borrow error. ("let and return" is literally
+    // what the borrow checker suggested using here to fix the problem, so...)
+    #[allow(clippy::let_and_return)]
+    let answer = match roc_parse::module::header().parse(&arena, state) {
+        Ok((ast::Module::Interface { header }, state)) => {
+            let module_id = send_header(
+                header.name,
+                header.exposes.into_bump_slice(),
+                header.imports.into_bump_slice(),
+                state,
+                module_ids,
+                msg_tx,
+            );
+
+            Ok(module_id)
+        }
+        Ok((ast::Module::App { header }, state)) => match module_ids {
+            MaybeShared::Shared(_, _) => {
+                // If this is Shared, it means we're trying to import
+                // an app module which is not the root. Not allowed!
+                Err(LoadingProblem::TriedToImportAppModule)
+            }
+            unique_modules @ MaybeShared::Unique(_, _) => {
+                let module_id = send_header(
+                    header.name,
+                    header.provides.into_bump_slice(),
+                    header.imports.into_bump_slice(),
+                    state,
+                    unique_modules,
+                    msg_tx,
+                );
+
+                Ok(module_id)
+            }
+        },
+        Err((fail, _)) => Err(LoadingProblem::ParsingFailed { filename, fail }),
+    };
+
+    answer
+}
+
 /// Load a module by its filename
+///
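+/// Reads the entire file into memory as raw bytes using `fs::read`, then
+/// passes those bytes to `parse_src`. The bytes are not validated as UTF-8
+/// up front here; validation is left to the parser, which checks lazily.
+/// If the read fails, this returns `LoadingProblem::FileProblem`.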
 fn load_filename(
    filename: PathBuf,
    msg_tx: MsgSender,
    module_ids: SharedModules<'_, '_>,
 ) -> Result<ModuleId, LoadingProblem> {
-    match read_to_string(&filename) {
-        Ok(src) => {
-            let arena = Bump::new();
-            let state = State::new(&src, Attempting::Module);
-
-            // TODO figure out if there's a way to address this clippy error
-            // without introducing a borrow error. ("let and return" is literally
-            // what the borrow checker suggested using here to fix the problem, so...)
-            #[allow(clippy::let_and_return)]
-            let answer = match roc_parse::module::header().parse(&arena, state) {
-                Ok((ast::Module::Interface { header }, state)) => {
-                    let module_id = send_header(
-                        header.name,
-                        header.exposes.into_bump_slice(),
-                        header.imports.into_bump_slice(),
-                        state,
-                        module_ids,
-                        msg_tx,
-                    );
-
-                    Ok(module_id)
-                }
-                Ok((ast::Module::App { header }, state)) => match module_ids {
-                    MaybeShared::Shared(_, _) => {
-                        // If this is Shared, it means we're trying to import
-                        // an app module which is not the root. Not alllowed!
-                        Err(LoadingProblem::TriedToImportAppModule)
-                    }
-                    unique_modules @ MaybeShared::Unique(_, _) => {
-                        let module_id = send_header(
-                            header.name,
-                            header.provides.into_bump_slice(),
-                            header.imports.into_bump_slice(),
-                            state,
-                            unique_modules,
-                            msg_tx,
-                        );
-
-                        Ok(module_id)
-                    }
-                },
-                Err((fail, _)) => Err(LoadingProblem::ParsingFailed { filename, fail }),
-            };
-
-            answer
-        }
+    match fs::read(&filename) {
+        Ok(bytes) => parse_src(filename, msg_tx, module_ids, bytes.as_ref()),
        Err(err) => Err(LoadingProblem::FileProblem {
            filename,
            error: err.kind(),
@ -746,7 +759,7 @@ fn send_header<'a>(

    // Box up the input &str for transfer over the wire.
    // We'll need this in order to continue parsing later.
-    let src: Box<str> = state.input.to_string().into();
+    let src: Box<[u8]> = state.bytes.into();

    // Send the deps to the coordinator thread for processing,
    // then continue on to parsing and canonicalizing defs.
@ -961,7 +974,7 @@ fn parse_and_constrain(

    let (parsed_defs, _) = module_defs()
        .parse(&arena, state)
-        .expect("TODO gracefully handle parse error on module defs");
+        .expect("TODO gracefully handle parse error on module defs. IMPORTANT: Bail out entirely if there are any BadUtf8 problems! That means the whole source file is not valid UTF-8 and any other errors we report may get mis-reported. We rely on this for safety in an `unsafe` block later on in this function.");

    let (module, declarations, ident_ids, constraint, problems) = match canonicalize_module_defs(
        &arena,
@ -1001,9 +1014,13 @@ fn parse_and_constrain(
        }
    };

-    let src = header.src;
    let imported_modules = header.imported_modules;

+    // SAFETY: By this point we've already incrementally verified that there
+    // are no UTF-8 errors in these bytes. If there had been any UTF-8 errors,
+    // we'd have bailed out before now.
+    let src: Box<str> = unsafe { from_utf8_unchecked(header.src.as_ref()).to_string().into() };
+
    tokio::spawn(async move {
        let mut tx = msg_tx;

--- a/compiler/load/tests/helpers/mod.rs
+++ b/compiler/load/tests/helpers/mod.rs
@ -92,7 +92,7 @@ pub fn parse_with<'a>(arena: &'a Bump, input: &'a str) -> Result<ast::Expr<'a>,

 #[allow(dead_code)]
 pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+    let state = State::new(input.as_bytes(), Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);

--- a/compiler/mono/tests/helpers/mod.rs
+++ b/compiler/mono/tests/helpers/mod.rs
@ -53,7 +53,7 @@ pub fn parse_with<'a>(arena: &'a Bump, input: &'a str) -> Result<ast::Expr<'a>,

 #[allow(dead_code)]
 pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+    let state = State::new(input.as_bytes(), Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);

--- a/compiler/parse/Cargo.toml
+++ b/compiler/parse/Cargo.toml
@ -11,6 +11,7 @@ roc_region = { path = "../region" }
 roc_module = { path = "../module" }
 bumpalo = { version = "3.2", features = ["collections"] }
 inlinable_string = "0.1"
+encode_unicode = "0.3"

 [dev-dependencies]
 pretty_assertions = "0.5.1"
--- a/compiler/parse/src/blankspace.rs
+++ b/compiler/parse/src/blankspace.rs
@ -1,6 +1,8 @@
 use crate::ast::CommentOrNewline::{self, *};
 use crate::ast::Spaceable;
-use crate::parser::{self, and, unexpected, unexpected_eof, Parser, State};
+use crate::parser::{
+    self, and, peek_utf8_char, unexpected, unexpected_eof, FailReason, Parser, State,
+};
 use bumpalo::collections::string::String;
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
@ -216,147 +218,179 @@ fn spaces<'a>(
 ) -> impl Parser<'a, &'a [CommentOrNewline<'a>]> {
    move |arena: &'a Bump, state: State<'a>| {
        let original_state = state.clone();
-        let chars = state.input.chars().peekable();
        let mut space_list = Vec::new_in(arena);
-        let mut chars_parsed = 0;
+        let mut bytes_parsed = 0;
        let mut comment_line_buf = String::new_in(arena);
        let mut line_state = LineState::Normal;
        let mut state = state;
        let mut any_newlines = false;

-        for ch in chars {
-            chars_parsed += 1;
+        while !state.bytes.is_empty() {
+            match peek_utf8_char(&state) {
+                Ok((ch, utf8_len)) => {
+                    bytes_parsed += utf8_len;

-            match line_state {
-                LineState::Normal => {
-                    match ch {
-                        ' ' => {
-                            // Don't check indentation here; it might not be enough
-                            // indentation yet, but maybe it will be after more spaces happen!
-                            state = state.advance_spaces(1)?;
-                        }
-                        '\r' => {
-                            // Ignore carriage returns.
-                            state = state.advance_spaces(1)?;
-                        }
-                        '\n' => {
-                            // No need to check indentation because we're about to reset it anyway.
-                            state = state.newline()?;
+                    match line_state {
+                        LineState::Normal => {
+                            match ch {
+                                ' ' => {
+                                    // Don't check indentation here; it might not be enough
+                                    // indentation yet, but maybe it will be after more spaces happen!
+                                    state = state.advance_spaces(1)?;
+                                }
+                                '\r' => {
+                                    // Ignore carriage returns.
+                                    state = state.advance_spaces(1)?;
+                                }
+                                '\n' => {
+                                    // No need to check indentation because we're about to reset it anyway.
+                                    state = state.newline()?;

-                            // Newlines only get added to the list when they're outside comments.
-                            space_list.push(Newline);
+                                    // Newlines only get added to the list when they're outside comments.
+                                    space_list.push(Newline);

-                            any_newlines = true;
-                        }
-                        '#' => {
-                            // Check indentation to make sure we were indented enough
-                            // before this comment began.
-                            state = state
-                                .check_indent(min_indent)
-                                .map_err(|(fail, _)| (fail, original_state.clone()))?
-                                .advance_without_indenting(1)?;
-
-                            // We're now parsing a line comment!
-                            line_state = LineState::Comment;
-                        }
-                        nonblank => {
-                            return if require_at_least_one && chars_parsed <= 1 {
-                                // We've parsed 1 char and it was not a space,
-                                // but we require parsing at least one space!
-                                Err(unexpected(nonblank, 0, state.clone(), state.attempting))
-                            } else {
-                                // First make sure we were indented enough!
-                                //
-                                // (We only do this if we've encountered any newlines.
-                                // Otherwise, we assume indentation is already correct.
-                                // It's actively important for correctness that we skip
-                                // this check if there are no newlines, because otherwise
-                                // we would have false positives for single-line defs.)
-                                if any_newlines {
+                                    any_newlines = true;
+                                }
+                                '#' => {
+                                    // Check indentation to make sure we were indented enough
+                                    // before this comment began.
                                    state = state
                                        .check_indent(min_indent)
-                                        .map_err(|(fail, _)| (fail, original_state))?;
+                                        .map_err(|(fail, _)| (fail, original_state.clone()))?
+                                        .advance_without_indenting(1)?;
+
+                                    // We're now parsing a line comment!
+                                    line_state = LineState::Comment;
                                }
-
-                                Ok((space_list.into_bump_slice(), state))
-                            };
-                        }
-                    }
-                }
-                LineState::Comment => {
-                    match ch {
-                        ' ' => {
-                            // If we're in a line comment, this won't affect indentation anyway.
-                            state = state.advance_without_indenting(1)?;
-
-                            if comment_line_buf.len() == 1 {
-                                match comment_line_buf.chars().next() {
-                                    Some('#') => {
-                                        // This is a comment begining with `## ` - that is,
-                                        // a doc comment.
+                                _ => {
+                                    return if require_at_least_one && bytes_parsed <= 1 {
+                                        // We've parsed 1 char and it was not a space,
+                                        // but we require parsing at least one space!
+                                        Err(unexpected(0, state.clone(), state.attempting))
+                                    } else {
+                                        // First make sure we were indented enough!
                                        //
-                                        // (The space is important; otherwise, this is not
-                                        // a doc comment, but rather something like a
-                                        // big separator block, e.g. ############)
-                                        line_state = LineState::DocComment;
+                                        // (We only do this if we've encountered any newlines.
+                                        // Otherwise, we assume indentation is already correct.
+                                        // It's actively important for correctness that we skip
+                                        // this check if there are no newlines, because otherwise
+                                        // we would have false positives for single-line defs.)
+                                        if any_newlines {
+                                            state = state
+                                                .check_indent(min_indent)
+                                                .map_err(|(fail, _)| (fail, original_state))?;
+                                        }

-                                        // This is now the beginning of the doc comment.
-                                        comment_line_buf.clear();
-                                    }
-                                    _ => {
+                                        Ok((space_list.into_bump_slice(), state))
+                                    };
+                                }
+                            }
+                        }
+                        LineState::Comment => {
+                            match ch {
+                                ' ' => {
+                                    // If we're in a line comment, this won't affect indentation anyway.
+                                    state = state.advance_without_indenting(1)?;
+
+                                    if comment_line_buf.len() == 1 {
+                                        match comment_line_buf.chars().next() {
+                                            Some('#') => {
+                                                // This is a comment beginning with `## ` - that is,
+                                                // a doc comment.
+                                                //
+                                                // (The space is important; otherwise, this is not
+                                                // a doc comment, but rather something like a
+                                                // big separator block, e.g. ############)
+                                                line_state = LineState::DocComment;
+
+                                                // This is now the beginning of the doc comment.
+                                                comment_line_buf.clear();
+                                            }
+                                            _ => {
+                                                comment_line_buf.push(ch);
+                                            }
+                                        }
+                                    } else {
                                        comment_line_buf.push(ch);
                                    }
                                }
-                            } else {
-                                comment_line_buf.push(ch);
+                                '\n' => {
+                                    state = state.newline()?;
+
+                                    // This was a newline, so end this line comment.
+                                    space_list.push(LineComment(comment_line_buf.into_bump_str()));
+                                    comment_line_buf = String::new_in(arena);
+
+                                    line_state = LineState::Normal;
+                                }
+                                nonblank => {
+                                    // Chars can have byte lengths of more than 1!
+                                    state = state.advance_without_indenting(nonblank.len_utf8())?;
+
+                                    comment_line_buf.push(nonblank);
+                                }
                            }
                        }
-                        '\n' => {
-                            state = state.newline()?;
+                        LineState::DocComment => {
+                            match ch {
+                                ' ' => {
+                                    // If we're in a doc comment, this won't affect indentation anyway.
+                                    state = state.advance_without_indenting(1)?;

-                            // This was a newline, so end this line comment.
-                            space_list.push(LineComment(comment_line_buf.into_bump_str()));
-                            comment_line_buf = String::new_in(arena);
+                                    comment_line_buf.push(ch);
+                                }
+                                '\n' => {
+                                    state = state.newline()?;

-                            line_state = LineState::Normal;
-                        }
-                        nonblank => {
-                            // Chars can have btye lengths of more than 1!
-                            state = state.advance_without_indenting(nonblank.len_utf8())?;
+                                    // This was a newline, so end this doc comment.
+                                    space_list.push(DocComment(comment_line_buf.into_bump_str()));
+                                    comment_line_buf = String::new_in(arena);

-                            comment_line_buf.push(nonblank);
+                                    line_state = LineState::Normal;
+                                }
+                                nonblank => {
+                                    state = state.advance_without_indenting(utf8_len)?;
+
+                                    comment_line_buf.push(nonblank);
+                                }
+                            }
                        }
                    }
                }
-                LineState::DocComment => {
-                    match ch {
-                        ' ' => {
-                            // If we're in a doc comment, this won't affect indentation anyway.
-                            state = state.advance_without_indenting(1)?;
+                Err(FailReason::BadUtf8) => {
+                    // If we hit an invalid UTF-8 character, bail out immediately.
+                    return state.fail(FailReason::BadUtf8);
+                }
+                Err(_) => {
+                    if require_at_least_one && bytes_parsed == 0 {
+                        return Err(unexpected_eof(0, state.attempting, state));
+                    } else {
+                        let space_slice = space_list.into_bump_slice();

-                            comment_line_buf.push(ch);
+                        // First make sure we were indented enough!
+                        //
+                        // (We only do this if we've encountered any newlines.
+                        // Otherwise, we assume indentation is already correct.
+                        // It's actively important for correctness that we skip
+                        // this check if there are no newlines, because otherwise
+                        // we would have false positives for single-line defs.)
+                        if any_newlines {
+                            return Ok((
+                                space_slice,
+                                state
+                                    .check_indent(min_indent)
+                                    .map_err(|(fail, _)| (fail, original_state))?,
+                            ));
                        }
-                        '\n' => {
-                            state = state.newline()?;

-                            // This was a newline, so end this doc comment.
-                            space_list.push(DocComment(comment_line_buf.into_bump_str()));
-                            comment_line_buf = String::new_in(arena);
-
-                            line_state = LineState::Normal;
-                        }
-                        nonblank => {
-                            // Chars can have btye lengths of more than 1!
-                            state = state.advance_without_indenting(nonblank.len_utf8())?;
-
-                            comment_line_buf.push(nonblank);
-                        }
+                        return Ok((space_slice, state));
                    }
                }
-            }
+            };
        }

-        if require_at_least_one && chars_parsed == 0 {
+        // If we didn't parse anything, return unexpected EOF
+        if require_at_least_one && original_state.bytes.len() == state.bytes.len() {
            Err(unexpected_eof(0, state.attempting, state))
        } else {
            // First make sure we were indented enough!
--- a/compiler/parse/src/expr.rs
+++ b/compiler/parse/src/expr.rs
@ -8,8 +8,8 @@ use crate::ident::{global_tag_or_ident, ident, lowercase_ident, Ident};
 use crate::keyword;
 use crate::number_literal::number_literal;
 use crate::parser::{
-    self, allocated, char, fail, not, not_followed_by, optional, sep_by1, string, then, unexpected,
-    unexpected_eof, Either, Fail, FailReason, ParseResult, Parser, State,
+    self, allocated, ascii_char, ascii_string, fail, not, not_followed_by, optional, sep_by1, then,
+    unexpected, unexpected_eof, Either, Fail, FailReason, ParseResult, Parser, State,
 };
 use crate::type_annotation;
 use bumpalo::collections::string::String;
@ -22,7 +22,7 @@ pub fn expr<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
    // Recursive parsers must not directly invoke functions which return (impl Parser),
    // as this causes rustc to stack overflow. Thus, parse_expr must be a
    // separate function which recurses by calling itself directly.
-    move |arena, state| parse_expr(min_indent, arena, state)
+    move |arena, state: State<'a>| parse_expr(min_indent, arena, state)
 }

 macro_rules! loc_parenthetical_expr {
@ -30,7 +30,7 @@ macro_rules! loc_parenthetical_expr {
    then(
        loc!(and!(
            between!(
-                char('('),
+                ascii_char('(' ),
                map_with_arena!(
                    space0_around(
                        loc!(move |arena, state| parse_expr($min_indent, arena, state)),
@ -43,7 +43,7 @@ macro_rules! loc_parenthetical_expr {
                        }
                    }
                ),
-                char(')')
+                ascii_char(')' )
            ),
            optional(either!(
                // There may optionally be function args after the ')'
@ -59,7 +59,7 @@ macro_rules! loc_parenthetical_expr {
                // as if there were any args they'd have consumed it anyway
                // e.g. in `((foo bar) baz.blah)` the `.blah` will be consumed by the `baz` parser
                either!(
-                    one_or_more!(skip_first!(char('.'), lowercase_ident())),
+                    one_or_more!(skip_first!(ascii_char('.' ), lowercase_ident())),
                    and!(space0($min_indent), equals_with_indent())
                )
            ))
@ -170,7 +170,7 @@ pub fn unary_op<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
    one_of!(
        map_with_arena!(
            and!(
-                loc!(char('!')),
+                loc!(ascii_char('!')),
                loc!(move |arena, state| parse_expr(min_indent, arena, state))
            ),
            |arena: &'a Bump, (loc_op, loc_expr): (Located<()>, Located<Expr<'a>>)| {
@ -179,7 +179,7 @@ pub fn unary_op<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
        ),
        map_with_arena!(
            and!(
-                loc!(char('-')),
+                loc!(ascii_char('-')),
                loc!(move |arena, state| parse_expr(min_indent, arena, state))
            ),
            |arena: &'a Bump, (loc_op, loc_expr): (Located<()>, Located<Expr<'a>>)| {
@ -450,9 +450,9 @@ pub fn loc_parenthetical_def<'a>(min_indent: u16) -> impl Parser<'a, Located<Exp
        let (loc_tuple, state) = loc!(and!(
            space0_after(
                between!(
-                    char('('),
+                    ascii_char('('),
                    space0_around(loc_pattern(min_indent), min_indent),
-                    char(')')
+                    ascii_char(')')
                ),
                min_indent,
            ),
@ -482,7 +482,7 @@ pub fn loc_parenthetical_def<'a>(min_indent: u16) -> impl Parser<'a, Located<Exp
 /// The '=' used in a def can't be followed by another '=' (or else it's actually
 /// an "==") and also it can't be followed by '>' (or else it's actually an "=>")
 fn equals_for_def<'a>() -> impl Parser<'a, ()> {
-    not_followed_by(char('='), one_of!(char('='), char('>')))
+    not_followed_by(ascii_char('='), one_of!(ascii_char('='), ascii_char('>')))
 }

 /// A definition, consisting of one of these:
@ -513,7 +513,7 @@ pub fn def<'a>(min_indent: u16) -> impl Parser<'a, Def<'a>> {
                ),
                // Annotation
                skip_first!(
-                    char(':'),
+                    ascii_char(':'),
                    // Spaces after the ':' (at a normal indentation level) and then the type.
                    // The type itself must be indented more than the pattern and ':'
                    space0_before(type_annotation::located(indented_more), indented_more)
@ -811,12 +811,12 @@ fn loc_parse_function_arg<'a>(

 fn reserved_keyword<'a>() -> impl Parser<'a, ()> {
    one_of!(
-        string(keyword::IF),
-        string(keyword::THEN),
-        string(keyword::ELSE),
-        string(keyword::WHEN),
-        string(keyword::IS),
-        string(keyword::AS)
+        ascii_string(keyword::IF),
+        ascii_string(keyword::THEN),
+        ascii_string(keyword::ELSE),
+        ascii_string(keyword::WHEN),
+        ascii_string(keyword::IS),
+        ascii_string(keyword::AS)
    )
 }

@ -824,7 +824,7 @@ fn closure<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
    map_with_arena!(
        skip_first!(
            // All closures start with a '\' - e.g. (\x -> x + 1)
-            char('\\'),
+            ascii_char('\\'),
            // Once we see the '\', we're committed to parsing this as a closure.
            // It may turn out to be malformed, but it is definitely a closure.
            optional(and!(
@ -833,13 +833,13 @@ fn closure<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
                    Attempting::ClosureParams,
                    // Params are comma-separated
                    sep_by1(
-                        char(','),
+                        ascii_char(','),
                        space0_around(loc_closure_param(min_indent), min_indent)
                    )
                ),
                skip_first!(
                    // Parse the -> which separates params from body
-                    string("->"),
+                    ascii_string("->"),
                    // Parse the body
                    attempt!(
                        Attempting::ClosureBody,
@ -877,9 +877,9 @@ fn parse_closure_param<'a>(
        // If you wrap it in parens, you can match any arbitrary pattern at all.
        // e.g. \User.UserId userId -> ...
        between!(
-            char('('),
+            ascii_char('('),
            space0_around(loc_pattern(min_indent), min_indent),
-            char(')')
+            ascii_char(')')
        )
    )
    .parse(arena, state)
@ -903,9 +903,9 @@ fn loc_pattern<'a>(min_indent: u16) -> impl Parser<'a, Located<Pattern<'a>>> {

 fn loc_parenthetical_pattern<'a>(min_indent: u16) -> impl Parser<'a, Located<Pattern<'a>>> {
    between!(
-        char('('),
+        ascii_char('('),
        move |arena, state| loc_pattern(min_indent).parse(arena, state),
-        char(')')
+        ascii_char(')')
    )
 }

@ -923,13 +923,13 @@ fn string_pattern<'a>() -> impl Parser<'a, Pattern<'a>> {
 }

 fn underscore_pattern<'a>() -> impl Parser<'a, Pattern<'a>> {
-    map!(char('_'), |_| Pattern::Underscore)
+    map!(ascii_char('_'), |_| Pattern::Underscore)
 }

 fn record_destructure<'a>(min_indent: u16) -> impl Parser<'a, Pattern<'a>> {
    then(
        collection!(
-            char('{'),
+            ascii_char('{'),
            move |arena: &'a bumpalo::Bump,
                  state: crate::parser::State<'a>|
                  -> crate::parser::ParseResult<'a, Located<crate::ast::Pattern<'a>>> {
@ -947,10 +947,13 @@ fn record_destructure<'a>(min_indent: u16) -> impl Parser<'a, Pattern<'a>> {
                // (This is true in both literals and types.)
                let (opt_loc_val, state) = crate::parser::optional(either!(
                    skip_first!(
-                        char(':'),
+                        ascii_char(':'),
                        space0_before(loc_pattern(min_indent), min_indent)
                    ),
-                    skip_first!(char('?'), space0_before(loc!(expr(min_indent)), min_indent))
+                    skip_first!(
+                        ascii_char('?'),
+                        space0_before(loc!(expr(min_indent)), min_indent)
+                    )
                ))
                .parse(arena, state)?;

@ -987,8 +990,8 @@ fn record_destructure<'a>(min_indent: u16) -> impl Parser<'a, Pattern<'a>> {

                Ok((answer, state))
            },
-            char(','),
-            char('}'),
+            ascii_char(','),
+            ascii_char('}'),
            min_indent
        ),
        move |_arena, state, loc_patterns| {
@ -1109,7 +1112,7 @@ mod when {
                            loc!(move |arena, state| parse_expr(min_indent, arena, state)),
                            min_indent,
                        ),
-                        string(keyword::IS)
+                        ascii_string(keyword::IS)
                    )
                )
            ),
@ -1132,7 +1135,7 @@ mod when {
    /// Parsing when with indentation.
    fn when_with_indent<'a>() -> impl Parser<'a, u16> {
        move |arena, state: State<'a>| {
-            string(keyword::WHEN)
+            ascii_string(keyword::WHEN)
                .parse(arena, state)
                .map(|((), state)| (state.indent_col, state))
        }
@ -1185,7 +1188,7 @@ mod when {
                }
            );

-            loop {
+            while !state.bytes.is_empty() {
                match branch_parser.parse(arena, state) {
                    Ok((next_output, next_state)) => {
                        state = next_state;
@ -1210,11 +1213,11 @@ mod when {
    ) -> impl Parser<'a, (Vec<'a, Located<Pattern<'a>>>, Option<Located<Expr<'a>>>)> {
        and!(
            sep_by1(
-                char('|'),
+                ascii_char('|'),
                space0_around(loc_pattern(min_indent), min_indent),
            ),
            optional(skip_first!(
-                string(keyword::IF),
+                ascii_string(keyword::IF),
                // TODO we should require space before the expression but not after
                space1_around(
                    loc!(move |arena, state| parse_expr(min_indent, arena, state)),
@ -1240,7 +1243,7 @@ mod when {
    /// Parsing the righthandside of a branch in a when conditional.
    fn branch_result<'a>(indent: u16) -> impl Parser<'a, Located<Expr<'a>>> {
        skip_first!(
-            string("->"),
+            ascii_string("->"),
            space0_before(
                loc!(move |arena, state| parse_expr(indent, arena, state)),
                indent,
@ -1253,7 +1256,7 @@ pub fn if_expr<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
    map_with_arena!(
        and!(
            skip_first!(
-                string(keyword::IF),
+                ascii_string(keyword::IF),
                space1_around(
                    loc!(move |arena, state| parse_expr(min_indent, arena, state)),
                    min_indent,
@ -1261,14 +1264,14 @@ pub fn if_expr<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
            ),
            and!(
                skip_first!(
-                    string(keyword::THEN),
+                    ascii_string(keyword::THEN),
                    space1_around(
                        loc!(move |arena, state| parse_expr(min_indent, arena, state)),
                        min_indent,
                    )
                ),
                skip_first!(
-                    string(keyword::ELSE),
+                    ascii_string(keyword::ELSE),
                    space1_before(
                        loc!(move |arena, state| parse_expr(min_indent, arena, state)),
                        min_indent,
@ -1310,10 +1313,15 @@ fn unary_negate_function_arg<'a>(min_indent: u16) -> impl Parser<'a, Located<Exp
                    // Try to parse a number literal *before* trying to parse unary negate,
                    // because otherwise (foo -1) will parse as (foo (Num.neg 1))
                    loc!(number_literal()),
-                    loc!(char('-'))
+                    loc!(ascii_char('-'))
                )
            ),
-            one_of!(char(' '), char('#'), char('\n'), char('>')),
+            one_of!(
+                ascii_char(' '),
+                ascii_char('#'),
+                ascii_char('\n'),
+                ascii_char('>')
+            ),
        ),
        move |arena, state, (spaces, num_or_minus_char)| {
            match num_or_minus_char {
@ -1530,17 +1538,15 @@ pub fn ident_without_apply<'a>() -> impl Parser<'a, Expr<'a>> {
 /// Like equals_for_def(), except it produces the indent_col of the state rather than ()
 pub fn equals_with_indent<'a>() -> impl Parser<'a, u16> {
    move |_arena, state: State<'a>| {
-        let mut iter = state.input.chars();
-
-        match iter.next() {
-            Some(ch) if ch == '=' => {
-                match iter.peekable().peek() {
+        match state.bytes.first() {
+            Some(&byte) if byte == b'=' => {
+                match state.bytes.get(1) {
                    // The '=' must not be followed by another `=` or `>`
                    // (See equals_for_def() for explanation)
-                    Some(next_ch) if next_ch != &'=' && next_ch != &'>' => {
+                    Some(&next_byte) if next_byte != b'=' && next_byte != b'>' => {
                        Ok((state.indent_col, state.advance_without_indenting(1)?))
                    }
-                    Some(next_ch) => Err(unexpected(*next_ch, 0, state, Attempting::Def)),
+                    Some(_) => Err(unexpected(0, state, Attempting::Def)),
                    None => Err(unexpected_eof(
                        1,
                        Attempting::Def,
@ -1548,21 +1554,17 @@ pub fn equals_with_indent<'a>() -> impl Parser<'a, u16> {
                    )),
                }
            }
-            Some(ch) => Err(unexpected(ch, 0, state, Attempting::Def)),
+            Some(_) => Err(unexpected(0, state, Attempting::Def)),
            None => Err(unexpected_eof(0, Attempting::Def, state)),
        }
    }
 }

 pub fn colon_with_indent<'a>() -> impl Parser<'a, u16> {
-    move |_arena, state: State<'a>| {
-        let mut iter = state.input.chars();
-
-        match iter.next() {
-            Some(ch) if ch == ':' => Ok((state.indent_col, state.advance_without_indenting(1)?)),
-            Some(ch) => Err(unexpected(ch, 0, state, Attempting::Def)),
-            None => Err(unexpected_eof(0, Attempting::Def, state)),
-        }
+    move |_arena, state: State<'a>| match state.bytes.first() {
+        Some(&byte) if byte == b':' => Ok((state.indent_col, state.advance_without_indenting(1)?)),
+        Some(_) => Err(unexpected(0, state, Attempting::Def)),
+        None => Err(unexpected_eof(0, Attempting::Def, state)),
    }
 }

@ -1606,32 +1608,32 @@ fn binop<'a>() -> impl Parser<'a, BinOp> {
        // with other valid operators (e.g. "<=" begins with "<") must
        // come before the shorter ones; otherwise, they will never
        // be reached because the shorter one will pass and consume!
-        map!(string("|>"), |_| BinOp::Pizza),
-        map!(string("=="), |_| BinOp::Equals),
-        map!(string("!="), |_| BinOp::NotEquals),
-        map!(string("&&"), |_| BinOp::And),
-        map!(string("||"), |_| BinOp::Or),
-        map!(char('+'), |_| BinOp::Plus),
-        map!(char('*'), |_| BinOp::Star),
-        map!(char('-'), |_| BinOp::Minus),
-        map!(string("//"), |_| BinOp::DoubleSlash),
-        map!(char('/'), |_| BinOp::Slash),
-        map!(string("<="), |_| BinOp::LessThanOrEq),
-        map!(char('<'), |_| BinOp::LessThan),
-        map!(string(">="), |_| BinOp::GreaterThanOrEq),
-        map!(char('>'), |_| BinOp::GreaterThan),
-        map!(char('^'), |_| BinOp::Caret),
-        map!(string("%%"), |_| BinOp::DoublePercent),
-        map!(char('%'), |_| BinOp::Percent)
+        map!(ascii_string("|>"), |_| BinOp::Pizza),
+        map!(ascii_string("=="), |_| BinOp::Equals),
+        map!(ascii_string("!="), |_| BinOp::NotEquals),
+        map!(ascii_string("&&"), |_| BinOp::And),
+        map!(ascii_string("||"), |_| BinOp::Or),
+        map!(ascii_char('+'), |_| BinOp::Plus),
+        map!(ascii_char('*'), |_| BinOp::Star),
+        map!(ascii_char('-'), |_| BinOp::Minus),
+        map!(ascii_string("//"), |_| BinOp::DoubleSlash),
+        map!(ascii_char('/'), |_| BinOp::Slash),
+        map!(ascii_string("<="), |_| BinOp::LessThanOrEq),
+        map!(ascii_char('<'), |_| BinOp::LessThan),
+        map!(ascii_string(">="), |_| BinOp::GreaterThanOrEq),
+        map!(ascii_char('>'), |_| BinOp::GreaterThan),
+        map!(ascii_char('^'), |_| BinOp::Caret),
+        map!(ascii_string("%%"), |_| BinOp::DoublePercent),
+        map!(ascii_char('%'), |_| BinOp::Percent)
    )
 }

 pub fn list_literal<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
    let elems = collection!(
-        char('['),
+        ascii_char('['),
        loc!(expr(min_indent)),
-        char(','),
-        char(']'),
+        ascii_char(','),
+        ascii_char(']'),
        min_indent
    );

@ -1673,9 +1675,11 @@ pub fn record_literal<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
                    };

                    // there can be field access, e.g. `{ x : 4 }.x`
-                    let (accesses, state) =
-                        optional(one_or_more!(skip_first!(char('.'), lowercase_ident())))
-                            .parse(arena, state)?;
+                    let (accesses, state) = optional(one_or_more!(skip_first!(
+                        ascii_char('.'),
+                        lowercase_ident()
+                    )))
+                    .parse(arena, state)?;

                    if let Some(fields) = accesses {
                        for field in fields {
@ -1768,7 +1772,7 @@ pub fn record_literal<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
 /// This is mainly for matching tags in closure params, e.g. \@Foo -> ...
 pub fn private_tag<'a>() -> impl Parser<'a, &'a str> {
    map_with_arena!(
-        skip_first!(char('@'), global_tag()),
+        skip_first!(ascii_char('@'), global_tag()),
        |arena: &'a Bump, name: &'a str| {
            let mut buf = String::with_capacity_in(1 + name.len(), arena);

--- a/compiler/parse/src/ident.rs
+++ b/compiler/parse/src/ident.rs
@ -1,6 +1,6 @@
 use crate::ast::Attempting;
 use crate::keyword;
-use crate::parser::{unexpected, unexpected_eof, Fail, FailReason, ParseResult, Parser, State};
+use crate::parser::{peek_utf8_char, unexpected, Fail, FailReason, ParseResult, Parser, State};
 use bumpalo::collections::string::String;
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
@ -67,129 +67,126 @@ impl<'a> Ident<'a> {
 /// Sometimes we may want to check for those later in the process, and give
 /// more contextually-aware error messages than "unexpected `if`" or the like.
 #[inline(always)]
-pub fn parse_ident<'a, I>(
+pub fn parse_ident<'a>(
    arena: &'a Bump,
-    chars: &mut I,
-    state: State<'a>,
-) -> ParseResult<'a, (Ident<'a>, Option<char>)>
-where
-    I: Iterator<Item = char>,
-{
+    mut state: State<'a>,
+) -> ParseResult<'a, (Ident<'a>, Option<char>)> {
    let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
    let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena);
    let mut noncapitalized_parts: Vec<&'a str> = Vec::new_in(arena);
    let mut is_capitalized;
    let is_accessor_fn;
    let mut is_private_tag = false;
-    let mut chars_parsed;

    // Identifiers and accessor functions must start with either a letter or a dot.
    // If this starts with neither, it must be something else!
-    match chars.next() {
-        Some(ch) => {
-            if ch == '@' {
-                // '@' must always be followed by a capital letter!
-                match chars.next() {
-                    Some(ch) if ch.is_uppercase() => {
-                        part_buf.push('@');
-                        part_buf.push(ch);
+    match peek_utf8_char(&state) {
+        Ok((first_ch, bytes_parsed)) => {
+            if first_ch.is_alphabetic() {
+                part_buf.push(first_ch);

-                        is_private_tag = true;
-                        is_capitalized = true;
-                        is_accessor_fn = false;
-
-                        chars_parsed = 2;
-                    }
-                    Some(ch) => {
-                        return Err(unexpected(ch, 0, state, Attempting::Identifier));
-                    }
-                    None => {
-                        return Err(unexpected_eof(0, Attempting::Identifier, state));
-                    }
-                }
-            } else if ch.is_alphabetic() {
-                part_buf.push(ch);
-
-                is_capitalized = ch.is_uppercase();
+                is_capitalized = first_ch.is_uppercase();
                is_accessor_fn = false;

-                chars_parsed = 1;
-            } else if ch == '.' {
+                state = state.advance_without_indenting(bytes_parsed)?;
+            } else if first_ch == '.' {
                is_capitalized = false;
                is_accessor_fn = true;

-                chars_parsed = 1;
+                state = state.advance_without_indenting(bytes_parsed)?;
+            } else if first_ch == '@' {
+                state = state.advance_without_indenting(bytes_parsed)?;
+
+                // '@' must always be followed by a capital letter!
+                match peek_utf8_char(&state) {
+                    Ok((next_ch, next_bytes_parsed)) => {
+                        if next_ch.is_uppercase() {
+                            state = state.advance_without_indenting(next_bytes_parsed)?;
+
+                            part_buf.push('@');
+                            part_buf.push(next_ch);
+
+                            is_private_tag = true;
+                            is_capitalized = true;
+                            is_accessor_fn = false;
+                        } else {
+                            return Err(unexpected(
+                                bytes_parsed + next_bytes_parsed,
+                                state,
+                                Attempting::Identifier,
+                            ));
+                        }
+                    }
+                    Err(reason) => return state.fail(reason),
+                }
            } else {
-                return Err(unexpected(ch, 0, state, Attempting::Identifier));
+                return Err(unexpected(0, state, Attempting::Identifier));
            }
        }
-        None => {
-            return Err(unexpected_eof(0, Attempting::Identifier, state));
+        Err(reason) => return state.fail(reason),
+    }
+
+    while !state.bytes.is_empty() {
+        match peek_utf8_char(&state) {
+            Ok((ch, bytes_parsed)) => {
+                // After the first character, only these are allowed:
+                //
+                // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
+                // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                // * A dot ('.')
+                if ch.is_alphabetic() {
+                    if part_buf.is_empty() {
+                        // Capitalization is determined by the first character in the part.
+                        is_capitalized = ch.is_uppercase();
+                    }
+
+                    part_buf.push(ch);
+                } else if ch.is_ascii_digit() {
+                    // Parts may not start with numbers!
+                    if part_buf.is_empty() {
+                        return malformed(
+                            Some(ch),
+                            arena,
+                            state,
+                            capitalized_parts,
+                            noncapitalized_parts,
+                        );
+                    }
+
+                    part_buf.push(ch);
+                } else if ch == '.' {
+                    // There are two possible errors here:
+                    //
+                    // 1. Having two consecutive dots is an error.
+                    // 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
+                    if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
+                        return malformed(
+                            Some(ch),
+                            arena,
+                            state,
+                            capitalized_parts,
+                            noncapitalized_parts,
+                        );
+                    }
+
+                    if is_capitalized {
+                        capitalized_parts.push(part_buf.into_bump_str());
+                    } else {
+                        noncapitalized_parts.push(part_buf.into_bump_str());
+                    }
+
+                    // Now that we've recorded the contents of the current buffer, reset it.
+                    part_buf = String::new_in(arena);
+                } else {
+                    // This must be the end of the identifier. We're done!
+
+                    break;
+                }
+
+                state = state.advance_without_indenting(bytes_parsed)?;
+            }
+            Err(reason) => return state.fail(reason),
        }
-    };
-
-    let mut next_char = None;
-
-    while let Some(ch) = chars.next() {
-        // After the first character, only these are allowed:
-        //
-        // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
-        // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
-        // * A dot ('.')
-        if ch.is_alphabetic() {
-            if part_buf.is_empty() {
-                // Capitalization is determined by the first character in the part.
-                is_capitalized = ch.is_uppercase();
-            }
-
-            part_buf.push(ch);
-        } else if ch.is_ascii_digit() {
-            // Parts may not start with numbers!
-            if part_buf.is_empty() {
-                return malformed(
-                    Some(ch),
-                    arena,
-                    state,
-                    chars,
-                    capitalized_parts,
-                    noncapitalized_parts,
-                );
-            }
-
-            part_buf.push(ch);
-        } else if ch == '.' {
-            // There are two posssible errors here:
-            //
-            // 1. Having two consecutive dots is an error.
-            // 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
-            if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
-                return malformed(
-                    Some(ch),
-                    arena,
-                    state,
-                    chars,
-                    capitalized_parts,
-                    noncapitalized_parts,
-                );
-            }
-
-            if is_capitalized {
-                capitalized_parts.push(part_buf.into_bump_str());
-            } else {
-                noncapitalized_parts.push(part_buf.into_bump_str());
-            }
-
-            // Now that we've recorded the contents of the current buffer, reset it.
-            part_buf = String::new_in(arena);
-        } else {
-            // This must be the end of the identifier. We're done!
-
-            next_char = Some(ch);
-
-            break;
-        }
-
-        chars_parsed += 1;
    }

    if part_buf.is_empty() {
@ -200,10 +197,9 @@ where
        // If we made it this far and don't have a next_char, then necessarily
        // we have consumed a '.' char previously.
        return malformed(
-            next_char.or_else(|| Some('.')),
+            Some('.'),
            arena,
            state,
-            chars,
            capitalized_parts,
            noncapitalized_parts,
        );
@ -224,14 +220,7 @@ where

            Ident::AccessorFunction(value)
        } else {
-            return malformed(
-                None,
-                arena,
-                state,
-                chars,
-                capitalized_parts,
-                noncapitalized_parts,
-            );
+            return malformed(None, arena, state, capitalized_parts, noncapitalized_parts);
        }
    } else if noncapitalized_parts.is_empty() {
        // We have capitalized parts only, so this must be a tag.
@ -245,33 +234,19 @@ where
                    }
                } else {
                    // This is a qualified tag, which is not allowed!
-                    return malformed(
-                        None,
-                        arena,
-                        state,
-                        chars,
-                        capitalized_parts,
-                        noncapitalized_parts,
-                    );
+                    return malformed(None, arena, state, capitalized_parts, noncapitalized_parts);
                }
            }
            None => {
                // We had neither capitalized nor noncapitalized parts,
                // yet we made it this far. The only explanation is that this was
                // a stray '.' drifting through the cosmos.
-                return Err(unexpected('.', 1, state, Attempting::Identifier));
+                return Err(unexpected(1, state, Attempting::Identifier));
            }
        }
    } else if is_private_tag {
        // This is qualified field access with an '@' in front, which does not make sense!
-        return malformed(
-            None,
-            arena,
-            state,
-            chars,
-            capitalized_parts,
-            noncapitalized_parts,
-        );
+        return malformed(None, arena, state, capitalized_parts, noncapitalized_parts);
    } else {
        // We have multiple noncapitalized parts, so this must be field access.
        Ident::Access {
@ -280,22 +255,16 @@ where
        }
    };

-    let state = state.advance_without_indenting(chars_parsed)?;
-
-    Ok(((answer, next_char), state))
+    Ok(((answer, None), state))
 }

-fn malformed<'a, I>(
+fn malformed<'a>(
    opt_bad_char: Option<char>,
    arena: &'a Bump,
-    state: State<'a>,
-    chars: &mut I,
+    mut state: State<'a>,
    capitalized_parts: Vec<&'a str>,
    noncapitalized_parts: Vec<&'a str>,
-) -> ParseResult<'a, (Ident<'a>, Option<char>)>
-where
-    I: Iterator<Item = char>,
-{
+) -> ParseResult<'a, (Ident<'a>, Option<char>)> {
    // Reconstruct the original string that we've been parsing.
    let mut full_string = String::new_in(arena);

@ -311,30 +280,35 @@ where
    // Consume the remaining chars in the identifier.
    let mut next_char = None;

-    for ch in chars {
-        // We can't use ch.is_alphanumeric() here because that passes for
-        // things that are "numeric" but not ASCII digits, like `¾`
-        if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
-            full_string.push(ch);
-        } else {
-            next_char = Some(ch);
+    while !state.bytes.is_empty() {
+        match peek_utf8_char(&state) {
+            Ok((ch, bytes_parsed)) => {
+                // We can't use ch.is_alphanumeric() here because that passes for
+                // things that are "numeric" but not ASCII digits, like `¾`
+                if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
+                    full_string.push(ch);
+                } else {
+                    next_char = Some(ch);

-            break;
+                    break;
+                }
+
+                state = state.advance_without_indenting(bytes_parsed)?;
+            }
+            Err(reason) => return state.fail(reason),
        }
    }

-    let chars_parsed = full_string.len();
-
    Ok((
        (Ident::Malformed(full_string.into_bump_str()), next_char),
-        state.advance_without_indenting(chars_parsed)?,
+        state,
    ))
 }

 pub fn ident<'a>() -> impl Parser<'a, Ident<'a>> {
    move |arena: &'a Bump, state: State<'a>| {
        // Discard next_char; we don't need it.
-        let ((string, _), state) = parse_ident(arena, &mut state.input.chars(), state)?;
+        let ((string, _), state) = parse_ident(arena, state)?;

        Ok((string, state))
    }
@ -344,52 +318,47 @@ pub fn global_tag_or_ident<'a, F>(pred: F) -> impl Parser<'a, &'a str>
 where
    F: Fn(char) -> bool,
 {
-    move |arena, state: State<'a>| {
-        let mut chars = state.input.chars();
-
+    move |arena, mut state: State<'a>| {
        // pred will determine if this is a tag or ident (based on capitalization)
-        let first_letter = match chars.next() {
-            Some(first_char) => {
-                if pred(first_char) {
-                    first_char
-                } else {
-                    return Err(unexpected(
-                        first_char,
-                        0,
-                        state,
-                        Attempting::RecordFieldLabel,
-                    ));
+        let (first_letter, bytes_parsed) = match peek_utf8_char(&state) {
+            Ok((first_letter, bytes_parsed)) => {
+                if !pred(first_letter) {
+                    return Err(unexpected(0, state, Attempting::RecordFieldLabel));
                }
+
+                (first_letter, bytes_parsed)
            }
-            None => {
-                return Err(unexpected_eof(0, Attempting::RecordFieldLabel, state));
-            }
+            Err(reason) => return state.fail(reason),
        };

        let mut buf = String::with_capacity_in(1, arena);

        buf.push(first_letter);

-        for ch in chars {
-            // After the first character, only these are allowed:
-            //
-            // * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
-            // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
-            // * A ':' indicating the end of the field
-            if ch.is_alphabetic() || ch.is_ascii_digit() {
-                buf.push(ch);
-            } else {
-                // This is the end of the field. We're done!
-                break;
-            }
+        state = state.advance_without_indenting(bytes_parsed)?;
+
+        while !state.bytes.is_empty() {
+            match peek_utf8_char(&state) {
+                Ok((ch, bytes_parsed)) => {
+                    // After the first character, only these are allowed:
+                    //
+                    // * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
+                    // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                    // * A ':' indicating the end of the field
+                    if ch.is_alphabetic() || ch.is_ascii_digit() {
+                        buf.push(ch);
+
+                        state = state.advance_without_indenting(bytes_parsed)?;
+                    } else {
+                        // This is the end of the field. We're done!
+                        break;
+                    }
+                }
+                Err(reason) => return state.fail(reason),
+            };
        }

-        let chars_parsed = buf.len();
-
-        Ok((
-            buf.into_bump_str(),
-            state.advance_without_indenting(chars_parsed)?,
-        ))
+        Ok((buf.into_bump_str(), state))
    }
 }

--- a/compiler/parse/src/module.rs
+++ b/compiler/parse/src/module.rs
@ -6,7 +6,10 @@ use crate::blankspace::{space0_around, space1};
 use crate::expr::def;
 use crate::header::ModuleName;
 use crate::ident::unqualified_ident;
-use crate::parser::{self, char, loc, optional, string, unexpected, unexpected_eof, Parser, State};
+use crate::parser::{
+    self, ascii_char, ascii_string, loc, optional, peek_utf8_char, peek_utf8_char_at, unexpected,
+    Parser, State,
+};
 use bumpalo::collections::{String, Vec};
 use roc_region::all::Located;

@ -30,7 +33,10 @@ pub fn app_module<'a>() -> impl Parser<'a, Module<'a>> {
 pub fn interface_header<'a>() -> impl Parser<'a, InterfaceHeader<'a>> {
    parser::map(
        and!(
-            skip_first!(string("interface"), and!(space1(1), loc!(module_name()))),
+            skip_first!(
+                ascii_string("interface"),
+                and!(space1(1), loc!(module_name()))
+            ),
            and!(exposes(), imports())
        ),
        |(
@ -56,72 +62,68 @@ pub fn interface_header<'a>() -> impl Parser<'a, InterfaceHeader<'a>> {

 #[inline(always)]
 pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
-    move |arena, state: State<'a>| {
-        let mut chars = state.input.chars();
+    move |arena, mut state: State<'a>| {
+        match peek_utf8_char(&state) {
+            Ok((first_letter, bytes_parsed)) => {
+                if !first_letter.is_uppercase() {
+                    return Err(unexpected(0, state, Attempting::Module));
+                };

-        let first_letter = match chars.next() {
-            Some(first_char) => {
-                // Module names must all be uppercase
-                if first_char.is_uppercase() {
-                    first_char
-                } else {
-                    return Err(unexpected(
-                        first_char,
-                        0,
-                        state,
-                        Attempting::RecordFieldLabel,
-                    ));
-                }
-            }
-            None => {
-                return Err(unexpected_eof(0, Attempting::Identifier, state));
-            }
-        };
+                let mut buf = String::with_capacity_in(4, arena);

-        let mut buf = String::with_capacity_in(1, arena);
+                buf.push(first_letter);

-        buf.push(first_letter);
+                state = state.advance_without_indenting(bytes_parsed)?;

-        while let Some(ch) = chars.next() {
-            // After the first character, only these are allowed:
-            //
-            // * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
-            // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
-            // * A '.' separating module parts
-            if ch.is_alphabetic() || ch.is_ascii_digit() {
-                buf.push(ch);
-            } else if ch == '.' {
-                match chars.next() {
-                    Some(next) => {
-                        if next.is_uppercase() {
-                            // If we hit another uppercase letter, keep going!
-                            buf.push('.');
-                            buf.push(next);
-                        } else {
-                            // We have finished parsing the module name.
+                while !state.bytes.is_empty() {
+                    match peek_utf8_char(&state) {
+                        Ok((ch, bytes_parsed)) => {
+                            // After the first character, only these are allowed:
                            //
-                            // There may be an identifier after this '.',
-                            // e.g. "baz" in `Foo.Bar.baz`
-                            break;
+                            // * Unicode alphabetic chars - you might include `鹏` if that's clear to your readers
+                            // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                            // * A '.' separating module parts
+                            if ch.is_alphabetic() || ch.is_ascii_digit() {
+                                state = state.advance_without_indenting(bytes_parsed)?;
+
+                                buf.push(ch);
+                            } else if ch == '.' {
+                                match peek_utf8_char_at(&state, 1) {
+                                    Ok((next, next_bytes_parsed)) => {
+                                        if next.is_uppercase() {
+                                            // If we hit another uppercase letter, keep going!
+                                            buf.push('.');
+                                            buf.push(next);
+
+                                            state = state.advance_without_indenting(
+                                                bytes_parsed + next_bytes_parsed,
+                                            )?;
+                                        } else {
+                                            // We have finished parsing the module name.
+                                            //
+                                            // There may be an identifier after this '.',
+                                            // e.g. "baz" in `Foo.Bar.baz`
+                                            return Ok((
+                                                ModuleName::new(buf.into_bump_str()),
+                                                state,
+                                            ));
+                                        }
+                                    }
+                                    Err(reason) => return state.fail(reason),
+                                }
+                            } else {
+                                // This is the end of the module name. We're done!
+                                break;
+                            }
                        }
-                    }
-                    None => {
-                        // A module name can't end with a '.'
-                        return Err(unexpected_eof(0, Attempting::Identifier, state));
+                        Err(reason) => return state.fail(reason),
                    }
                }
-            } else {
-                // This is the end of the module name. We're done!
-                break;
+
+                Ok((ModuleName::new(buf.into_bump_str()), state))
            }
+            Err(reason) => state.fail(reason),
        }
-
-        let chars_parsed = buf.len();
-
-        Ok((
-            ModuleName::new(buf.into_bump_str()),
-            state.advance_without_indenting(chars_parsed)?,
-        ))
    }
 }

@ -129,7 +131,7 @@ pub fn module_name<'a>() -> impl Parser<'a, ModuleName<'a>> {
 fn app_header<'a>() -> impl Parser<'a, AppHeader<'a>> {
    parser::map(
        and!(
-            skip_first!(string("app"), and!(space1(1), loc!(module_name()))),
+            skip_first!(ascii_string("app"), and!(space1(1), loc!(module_name()))),
            and!(provides(), imports())
        ),
        |(
@ -167,8 +169,14 @@ fn provides<'a>() -> impl Parser<
    ),
 > {
    and!(
-        and!(skip_second!(space1(1), string("provides")), space1(1)),
-        collection!(char('['), loc!(exposes_entry()), char(','), char(']'), 1)
+        and!(skip_second!(space1(1), ascii_string("provides")), space1(1)),
+        collection!(
+            ascii_char('['),
+            loc!(exposes_entry()),
+            ascii_char(','),
+            ascii_char(']'),
+            1
+        )
    )
 }

@ -181,8 +189,14 @@ fn exposes<'a>() -> impl Parser<
    ),
 > {
    and!(
-        and!(skip_second!(space1(1), string("exposes")), space1(1)),
-        collection!(char('['), loc!(exposes_entry()), char(','), char(']'), 1)
+        and!(skip_second!(space1(1), ascii_string("exposes")), space1(1)),
+        collection!(
+            ascii_char('['),
+            loc!(exposes_entry()),
+            ascii_char(','),
+            ascii_char(']'),
+            1
+        )
    )
 }

@ -195,8 +209,14 @@ fn imports<'a>() -> impl Parser<
    ),
 > {
    and!(
-        and!(skip_second!(space1(1), string("imports")), space1(1)),
-        collection!(char('['), loc!(imports_entry()), char(','), char(']'), 1)
+        and!(skip_second!(space1(1), ascii_string("imports")), space1(1)),
+        collection!(
+            ascii_char('['),
+            loc!(imports_entry()),
+            ascii_char(','),
+            ascii_char(']'),
+            1
+        )
    )
 }

@ -213,8 +233,14 @@ fn imports_entry<'a>() -> impl Parser<'a, ImportsEntry<'a>> {
            module_name(),
            // e.g. `.{ Task, after}`
            optional(skip_first!(
-                char('.'),
-                collection!(char('{'), loc!(exposes_entry()), char(','), char('}'), 1)
+                ascii_char('.'),
+                collection!(
+                    ascii_char('{'),
+                    loc!(exposes_entry()),
+                    ascii_char(','),
+                    ascii_char('}'),
+                    1
+                )
            ))
        ),
        |arena,
--- a/compiler/parse/src/number_literal.rs
+++ b/compiler/parse/src/number_literal.rs
@ -1,23 +1,19 @@
 use crate::ast::{Attempting, Base, Expr};
-use crate::parser::{unexpected, unexpected_eof, ParseResult, Parser, State};
+use crate::parser::{parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State};
 use std::char;
+use std::str::from_utf8_unchecked;

 pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>> {
    move |_arena, state: State<'a>| {
-        let mut chars = state.input.chars();
+        let bytes = &mut state.bytes.iter();

-        match chars.next() {
-            Some(first_ch) => {
+        match bytes.next() {
+            Some(&first_byte) => {
                // Number literals must start with either an '-' or a digit.
-                if first_ch == '-' || first_ch.is_ascii_digit() {
-                    parse_number_literal(first_ch, &mut chars, state)
+                if first_byte == b'-' || (first_byte as char).is_ascii_digit() {
+                    parse_number_literal(first_byte as char, bytes, state)
                } else {
-                    Err(unexpected(
-                        first_ch,
-                        first_ch.len_utf8(),
-                        state,
-                        Attempting::NumberLiteral,
-                    ))
+                    Err(unexpected(1, state, Attempting::NumberLiteral))
                }
            }
            None => Err(unexpected_eof(0, state.attempting, state)),
@ -28,11 +24,11 @@ pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>> {
 #[inline(always)]
 fn parse_number_literal<'a, I>(
    first_ch: char,
-    chars: &mut I,
+    bytes: &mut I,
    state: State<'a>,
 ) -> ParseResult<'a, Expr<'a>>
 where
-    I: Iterator<Item = char>,
+    I: Iterator<Item = &'a u8>,
 {
    use self::LiteralType::*;

@ -40,13 +36,12 @@ where

    // We already parsed 1 character (which may have been a minus sign).
    let mut bytes_parsed = 1;
-    let mut prev_ch = first_ch;
+    let mut prev_byte = first_ch as u8;
    let mut has_parsed_digits = first_ch.is_ascii_digit();

-    for next_ch in chars {
+    for &next_byte in bytes {
        let err_unexpected = || {
            Err(unexpected(
-                next_ch,
                bytes_parsed,
                state.clone(),
                Attempting::NumberLiteral,
@ -55,91 +50,91 @@ where

        let is_potentially_non_base10 = || {
            (bytes_parsed == 1 && first_ch == '0')
-                || (bytes_parsed == 2 && first_ch == '-' && prev_ch == '0')
+                || (bytes_parsed == 2 && first_ch == '-' && prev_byte == b'0')
        };

-        if next_ch == '.' {
-            if typ == Float {
-                // You only get one decimal point!
-                return err_unexpected();
-            } else {
-                typ = Float;
+        match next_byte as char {
+            '.' => {
+                if typ == Float {
+                    // You only get one decimal point!
+                    return err_unexpected();
+                } else {
+                    typ = Float;
+                }
            }
-        } else if next_ch == 'x' {
-            if is_potentially_non_base10() {
-                typ = Hex;
-            } else {
-                return err_unexpected();
+            'x' => {
+                if is_potentially_non_base10() {
+                    typ = Hex;
+                } else {
+                    return err_unexpected();
+                }
            }
-        } else if next_ch == 'b' && typ == Num {
-            // We have to check for typ == Num because otherwise we get a false
-            // positive here when parsing a hex literal that happens to have
-            // a 'b' in it, e.g. 0xbbbb
-            if is_potentially_non_base10() {
-                typ = Binary;
-            } else {
-                return err_unexpected();
+            'b' if typ == Num => {
+                // We have to check for typ == Num because otherwise we get a false
+                // positive here when parsing a hex literal that happens to have
+                // a 'b' in it, e.g. 0xbbbb
+                if is_potentially_non_base10() {
+                    typ = Binary;
+                } else {
+                    return err_unexpected();
+                }
            }
-        } else if next_ch == 'o' {
-            if is_potentially_non_base10() {
-                typ = Octal;
-            } else {
-                return err_unexpected();
+            'o' => {
+                if is_potentially_non_base10() {
+                    typ = Octal;
+                } else {
+                    return err_unexpected();
+                }
            }
-        } else if next_ch.is_ascii_digit() {
-            has_parsed_digits = true;
-        } else if next_ch != '_' &&
+            next_ch if next_ch.is_ascii_digit() => {
+                has_parsed_digits = true;
+            }
+            next_ch
+                if next_ch != '_' &&
            // ASCII alphabetic chars (like 'a' and 'f') are allowed in Hex int literals.
            // We parse them in any int literal, so we can give a more helpful error
            // in canonicalization (e.g. "the character 'f' is not allowed in Octal literals"
            // or "the character 'g' is outside the range of valid Hex literals")
-            !next_ch.is_ascii_alphabetic()
-        {
-            if has_parsed_digits {
-                // We hit an invalid number literal character; we're done!
-                break;
-            } else {
-                // No digits! We likely parsed a minus sign that's actually an operator.
-                return err_unexpected();
+            !next_ch.is_ascii_alphabetic() =>
+            {
+                if has_parsed_digits {
+                    // We hit an invalid number literal character; we're done!
+                    break;
+                } else {
+                    // No digits! We likely parsed a minus sign that's actually an operator.
+                    return err_unexpected();
+                }
            }
+            _ => {}
        }

        // Since we only consume characters in the ASCII range for number literals,
        // this will always be exactly 1. There's no need to call next_ch.utf8_len().
        bytes_parsed += 1;
-        prev_ch = next_ch;
+        prev_byte = next_byte;
    }

-    let from_base = |base| {
-        let is_negative = first_ch == '-';
-        let string = if is_negative {
-            &state.input[3..bytes_parsed]
-        } else {
-            &state.input[2..bytes_parsed]
-        };
-
-        Expr::NonBase10Int {
-            is_negative,
-            string,
-            base,
-        }
-    };
-
    // At this point we have a number, and will definitely succeed.
    // If the number is malformed (outside the supported range),
    // we'll succeed with an appropriate Expr which records that.
-    let expr = match typ {
-        Num => Expr::Num(&state.input[0..bytes_parsed]),
-        Float => Expr::Float(&state.input[0..bytes_parsed]),
+    match typ {
+        Num => Ok((
+            // SAFETY: it's safe to use from_utf8_unchecked here, because we've
+            // already validated that this range contains only ASCII characters
+            Expr::Num(unsafe { from_utf8_unchecked(&state.bytes[0..bytes_parsed]) }),
+            state.advance_without_indenting(bytes_parsed)?,
+        )),
+        Float => Ok((
+            // SAFETY: it's safe to use from_utf8_unchecked here, because we've
+            // already validated that this range contains only ASCII characters
+            Expr::Float(unsafe { from_utf8_unchecked(&state.bytes[0..bytes_parsed]) }),
+            state.advance_without_indenting(bytes_parsed)?,
+        )),
        // For these we trim off the 0x/0o/0b part
-        Hex => from_base(Base::Hex),
-        Octal => from_base(Base::Octal),
-        Binary => from_base(Base::Binary),
-    };
-
-    let next_state = state.advance_without_indenting(bytes_parsed)?;
-
-    Ok((expr, next_state))
+        Hex => from_base(Base::Hex, first_ch, bytes_parsed, state),
+        Octal => from_base(Base::Octal, first_ch, bytes_parsed, state),
+        Binary => from_base(Base::Binary, first_ch, bytes_parsed, state),
+    }
 }

 #[derive(Debug, PartialEq, Eq)]
@ -150,3 +145,29 @@ enum LiteralType {
    Octal,
    Binary,
 }
+
+fn from_base(
+    base: Base,
+    first_ch: char,
+    bytes_parsed: usize,
+    state: State<'_>,
+) -> ParseResult<'_, Expr<'_>> {
+    let is_negative = first_ch == '-';
+    let bytes = if is_negative {
+        &state.bytes[3..bytes_parsed]
+    } else {
+        &state.bytes[2..bytes_parsed]
+    };
+
+    match parse_utf8(bytes) {
+        Ok(string) => Ok((
+            Expr::NonBase10Int {
+                is_negative,
+                string,
+                base,
+            },
+            state.advance_without_indenting(bytes_parsed)?,
+        )),
+        Err(reason) => state.fail(reason),
+    }
+}
--- a/compiler/parse/src/parser.rs
+++ b/compiler/parse/src/parser.rs
@ -1,14 +1,17 @@
 use crate::ast::Attempting;
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
+use encode_unicode::CharExt;
 use roc_region::all::{Located, Region};
+use std::fmt;
+use std::str::from_utf8;
 use std::{char, u16};

 /// A position in a source file.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Clone, PartialEq, Eq)]
 pub struct State<'a> {
-    /// The raw input string.
-    pub input: &'a str,
+    /// The raw input bytes from the file.
+    pub bytes: &'a [u8],

    /// Current line of the input
    pub line: u32,
@ -39,15 +42,15 @@ pub enum Either<First, Second> {
 }

 impl<'a> State<'a> {
-    pub fn new(input: &'a str, attempting: Attempting) -> State<'a> {
+    pub fn new(bytes: &'a [u8], attempting: Attempting) -> State<'a> {
        State {
-            input,
+            bytes,
            line: 0,
            column: 0,
            indent_col: 0,
            is_indenting: true,
            attempting,
-            original_len: input.len(),
+            original_len: bytes.len(),
        }
    }

@ -69,7 +72,7 @@ impl<'a> State<'a> {
    ///
    /// So if the parser has consumed 8 bytes, this function will return 8.
    pub fn bytes_consumed(&self) -> usize {
-        self.original_len - self.input.len()
+        self.original_len - self.bytes.len()
    }

    /// Increments the line, then resets column, indent_col, and is_indenting.
@ -77,7 +80,7 @@ impl<'a> State<'a> {
    pub fn newline(&self) -> Result<Self, (Fail, Self)> {
        match self.line.checked_add(1) {
            Some(line) => Ok(State {
-                input: &self.input[1..],
+                bytes: &self.bytes[1..],
                line,
                column: 0,
                indent_col: 0,
@ -99,11 +102,11 @@ impl<'a> State<'a> {
    /// This assumes we are *not* advancing with spaces, or at least that
    /// any spaces on the line were preceded by non-spaces - which would mean
    /// they weren't eligible to indent anyway.
-    pub fn advance_without_indenting(&self, quantity: usize) -> Result<Self, (Fail, Self)> {
+    pub fn advance_without_indenting(self, quantity: usize) -> Result<Self, (Fail, Self)> {
        match (self.column as usize).checked_add(quantity) {
            Some(column_usize) if column_usize <= u16::MAX as usize => {
                Ok(State {
-                    input: &self.input[quantity..],
+                    bytes: &self.bytes[quantity..],
                    line: self.line,
                    column: column_usize as u16,
                    indent_col: self.indent_col,
@ -141,7 +144,7 @@ impl<'a> State<'a> {
                };

                Ok(State {
-                    input: &self.input[spaces..],
+                    bytes: &self.bytes[spaces..],
                    line: self.line,
                    column: column_usize as u16,
                    indent_col,
@ -169,6 +172,35 @@ impl<'a> State<'a> {
            end_line: self.line,
        }
    }
+
+    /// Return a failing ParseResult for the given FailReason
+    pub fn fail<T>(self, reason: FailReason) -> Result<(T, Self), (Fail, Self)> {
+        Err((
+            Fail {
+                reason,
+                attempting: self.attempting,
+            },
+            self,
+        ))
+    }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "State {{")?;
+
+        match from_utf8(self.bytes) {
+            Ok(string) => write!(f, "\n\tbytes: [utf8] {:?}", string)?,
+            Err(_) => write!(f, "\n\tbytes: [invalid utf8] {:?}", self.bytes)?,
+        }
+
+        write!(f, "\n\t(line, col): ({}, {}),", self.line, self.column)?;
+        write!(f, "\n\tindent_col: {}", self.indent_col)?;
+        write!(f, "\n\tis_indenting: {:?}", self.is_indenting)?;
+        write!(f, "\n\tattempting: {:?}", self.attempting)?;
+        write!(f, "\n\toriginal_len: {}", self.original_len)?;
+        write!(f, "\n}}")
+    }
 }

 #[test]
@ -182,13 +214,14 @@ pub type ParseResult<'a, Output> = Result<(Output, State<'a>), (Fail, State<'a>)

 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum FailReason {
-    Unexpected(char, Region),
+    Unexpected(Region),
    OutdentedTooFar,
    ConditionFailed,
    LineTooLong(u32 /* which line was too long */),
    TooManyLines,
    Eof(Region),
    InvalidPattern,
+    BadUtf8,
    ReservedKeyword(Region),
    ArgumentsBeforeEquals(Region),
 }
@ -332,13 +365,12 @@ pub fn unexpected_eof(
 }

 pub fn unexpected(
-    ch: char,
    chars_consumed: usize,
    state: State<'_>,
    attempting: Attempting,
 ) -> (Fail, State<'_>) {
    checked_unexpected(chars_consumed, state, |region| Fail {
-        reason: FailReason::Unexpected(ch, region),
+        reason: FailReason::Unexpected(region),
        attempting,
    })
 }
@ -385,9 +417,9 @@ fn line_too_long(attempting: Attempting, state: State<'_>) -> (Fail, State<'_>)
    // (for example) the LineTooLong initially occurs in the middle of
    // a one_of chain, which would otherwise prevent it from propagating.
    let column = u16::MAX;
-    let input = state.input.get(0..state.input.len()).unwrap();
+    let bytes = state.bytes.get(0..state.bytes.len()).unwrap();
    let state = State {
-        input,
+        bytes,
        line: state.line,
        indent_col: state.indent_col,
        is_indenting: state.is_indenting,
@ -399,29 +431,90 @@ fn line_too_long(attempting: Attempting, state: State<'_>) -> (Fail, State<'_>)
    (fail, state)
 }

-/// A single char.
-pub fn char<'a>(expected: char) -> impl Parser<'a, ()> {
-    move |_arena, state: State<'a>| match state.input.chars().next() {
-        Some(actual) if expected == actual => Ok(((), state.advance_without_indenting(1)?)),
-        Some(other_ch) => Err(unexpected(other_ch, 0, state, Attempting::Keyword)),
+/// A single ASCII char.
+pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
+    // Make sure this really is an ASCII char!
+    debug_assert!(expected.len_utf8() == 1);
+
+    move |_arena, state: State<'a>| match state.bytes.first() {
+        Some(&actual) if expected == actual as char => {
+            Ok(((), state.advance_without_indenting(1)?))
+        }
+        Some(_) => Err(unexpected(0, state, Attempting::Keyword)),
        _ => Err(unexpected_eof(0, Attempting::Keyword, state)),
    }
 }

-/// A hardcoded keyword string with no newlines in it.
-pub fn string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
-    // We can't have newlines because we don't attempt to advance the row
-    // in the state, only the column.
-    debug_assert!(!keyword.contains('\n'));
+/// A single UTF-8-encoded char. This will both parse *and* validate that the
+/// char is valid UTF-8.
+pub fn utf8_char2<'a>() -> impl Parser<'a, char> {
+    move |_arena, state: State<'a>| {
+        if !state.bytes.is_empty() {
+            match char::from_utf8_slice_start(state.bytes) {
+                Ok((ch, bytes_parsed)) => Ok((ch, state.advance_without_indenting(bytes_parsed)?)),
+                Err(_) => state.fail(FailReason::BadUtf8),
+            }
+        } else {
+            Err(unexpected_eof(0, state.attempting, state))
+        }
+    }
+}
+
+/// A single UTF-8-encoded char. This will both parse *and* validate that the
+/// char is valid UTF-8, but it will *not* advance the state.
+pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
+    if !state.bytes.is_empty() {
+        match char::from_utf8_slice_start(state.bytes) {
+            Ok((ch, len_utf8)) => Ok((ch, len_utf8)),
+            Err(_) => Err(FailReason::BadUtf8),
+        }
+    } else {
+        Err(FailReason::Eof(
+            Region::zero(), /* TODO get a better region */
+        ))
+    }
+}
+
+/// A single UTF-8-encoded char, with an offset. This will both parse *and*
+/// validate that the char is valid UTF-8, but it will *not* advance the state.
+pub fn peek_utf8_char_at<'a>(
+    state: &State<'a>,
+    offset: usize,
+) -> Result<(char, usize), FailReason> {
+    if state.bytes.len() > offset {
+        let bytes = &state.bytes[offset..];
+
+        match char::from_utf8_slice_start(bytes) {
+            Ok((ch, len_utf8)) => Ok((ch, len_utf8)),
+            Err(_) => Err(FailReason::BadUtf8),
+        }
+    } else {
+        Err(FailReason::Eof(
+            Region::zero(), /* TODO get a better region */
+        ))
+    }
+}
+
+/// A hardcoded string with no newlines, consisting only of ASCII characters
+pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
+    // Verify that this really is exclusively ASCII characters.
+    // Advancing the column by `keyword.len()` bytes relies upon this assumption!
+    //
+    // Also, this can't have newlines because we don't attempt to advance
+    // the row in the state, only the column.
+    debug_assert!(keyword.chars().all(|ch| ch.len_utf8() == 1 && ch != '\n'));

    move |_arena, state: State<'a>| {
-        let input = state.input;
        let len = keyword.len();

        // TODO do this comparison in one SIMD instruction (on supported systems)
-        match input.get(0..len) {
-            Some(next_str) if next_str == keyword => {
-                Ok(((), state.advance_without_indenting(len)?))
+        match state.bytes.get(0..len) {
+            Some(next_str) => {
+                if next_str == keyword.as_bytes() {
+                    Ok(((), state.advance_without_indenting(len)?))
+                } else {
+                    Err(unexpected(len, state, Attempting::Keyword))
+                }
            }
            _ => Err(unexpected_eof(0, Attempting::Keyword, state)),
        }
@ -686,7 +779,7 @@ macro_rules! collection {
                // We could change the AST to add extra storage specifically to
                // support empty literals containing newlines or comments, but this
                // does not seem worth even the tiniest regression in compiler performance.
-                zero_or_more!($crate::parser::char(' ')),
+                zero_or_more!($crate::parser::ascii_char(' ')),
                skip_second!(
                    $crate::parser::sep_by0(
                        $delimiter,
@ -912,6 +1005,7 @@ macro_rules! record_field {
            use $crate::ast::AssignedField::*;
            use $crate::blankspace::{space0, space0_before};
            use $crate::ident::lowercase_ident;
+            use $crate::parser::ascii_char;
            use $crate::parser::Either::*;

            // You must have a field name, e.g. "email"
@ -922,8 +1016,8 @@ macro_rules! record_field {
            // Having a value is optional; both `{ email }` and `{ email: blah }` work.
            // (This is true in both literals and types.)
            let (opt_loc_val, state) = $crate::parser::optional(either!(
-                skip_first!(char(':'), space0_before($val_parser, $min_indent)),
-                skip_first!(char('?'), space0_before($val_parser, $min_indent))
+                skip_first!(ascii_char(':'), space0_before($val_parser, $min_indent)),
+                skip_first!(ascii_char('?'), space0_before($val_parser, $min_indent))
            ))
            .parse(arena, state)?;

@ -952,10 +1046,10 @@ macro_rules! record_field {
 macro_rules! record_without_update {
    ($val_parser:expr, $min_indent:expr) => {
        collection!(
-            char('{'),
+            ascii_char('{'),
            loc!(record_field!($val_parser, $min_indent)),
-            char(','),
-            char('}'),
+            ascii_char(','),
+            ascii_char('}'),
            $min_indent
        )
    };
@ -965,7 +1059,7 @@ macro_rules! record_without_update {
 macro_rules! record {
    ($val_parser:expr, $min_indent:expr) => {
        skip_first!(
-            $crate::parser::char('{'),
+            $crate::parser::ascii_char('{'),
            and!(
                // You can optionally have an identifier followed by an '&' to
                // make this a record update, e.g. { Foo.user & username: "blah" }.
@ -981,7 +1075,7 @@ macro_rules! record {
                        )),
                        $min_indent
                    ),
-                    $crate::parser::char('&')
+                    $crate::parser::ascii_char('&')
                )),
                loc!(skip_first!(
                    // We specifically allow space characters inside here, so that
@ -995,16 +1089,16 @@ macro_rules! record {
                    // We could change the AST to add extra storage specifically to
                    // support empty literals containing newlines or comments, but this
                    // does not seem worth even the tiniest regression in compiler performance.
-                    zero_or_more!($crate::parser::char(' ')),
+                    zero_or_more!($crate::parser::ascii_char(' ')),
                    skip_second!(
                        $crate::parser::sep_by0(
-                            $crate::parser::char(','),
+                            $crate::parser::ascii_char(','),
                            $crate::blankspace::space0_around(
                                loc!(record_field!($val_parser, $min_indent)),
                                $min_indent
                            )
                        ),
-                        $crate::parser::char('}')
+                        $crate::parser::ascii_char('}')
                    )
                ))
            )
@ -1067,3 +1161,10 @@ where
 {
    attempt!(attempting, parser)
 }
+
+pub fn parse_utf8(bytes: &[u8]) -> Result<&str, FailReason> {
+    match from_utf8(bytes) {
+        Ok(string) => Ok(string),
+        Err(_) => Err(FailReason::BadUtf8),
+    }
+}
--- a/compiler/parse/src/string_literal.rs
+++ b/compiler/parse/src/string_literal.rs
@ -1,8 +1,7 @@
 use crate::ast::Attempting;
-use crate::parser::{unexpected, unexpected_eof, ParseResult, Parser, State};
+use crate::parser::{parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State};
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
-use std::char;

 pub enum StringLiteral<'a> {
    Line(&'a str),
@ -11,14 +10,15 @@ pub enum StringLiteral<'a> {

 pub fn parse<'a>() -> impl Parser<'a, StringLiteral<'a>> {
    move |arena: &'a Bump, state: State<'a>| {
-        let mut chars = state.input.chars();
+        let mut bytes = state.bytes.iter();

        // String literals must start with a quote.
        // If this doesn't, it must not be a string literal!
-        match chars.next() {
-            Some('"') => (),
-            Some(other_char) => {
-                return Err(unexpected(other_char, 0, state, Attempting::StringLiteral));
+        match bytes.next() {
+            Some(&byte) => {
+                if byte != b'"' {
+                    return Err(unexpected(0, state, Attempting::StringLiteral));
+                }
            }
            None => {
                return Err(unexpected_eof(0, Attempting::StringLiteral, state));
@ -35,44 +35,49 @@ pub fn parse<'a>() -> impl Parser<'a, StringLiteral<'a>> {
        // Since we're keeping the entire raw string, all we need to track is
        // how many characters we've parsed. So far, that's 1 (the opening `"`).
        let mut parsed_chars = 1;
-        let mut prev_ch = '"';
+        let mut prev_byte = b'"';

-        while let Some(ch) = chars.next() {
+        while let Some(&byte) = bytes.next() {
            parsed_chars += 1;

            // Potentially end the string (unless this is an escaped `"`!)
-            if ch == '"' && prev_ch != '\\' {
-                let string = if parsed_chars == 2 {
-                    if let Some('"') = chars.next() {
-                        // If the first three chars were all `"`, then this
-                        // literal begins with `"""` and is a block string.
-                        return parse_block_string(arena, state, &mut chars);
-                    } else {
-                        ""
+            if byte == b'"' && prev_byte != b'\\' {
+                let (string, state) = if parsed_chars == 2 {
+                    match bytes.next() {
+                        Some(byte) if *byte == b'"' => {
+                            // If the first three chars were all `"`, then this
+                            // literal begins with `"""` and is a block string.
+                            return parse_block_string(arena, state, &mut bytes);
+                        }
+                        _ => ("", state.advance_without_indenting(2)?),
                    }
                } else {
                    // Start at 1 so we omit the opening `"`.
                    // Subtract 1 from parsed_chars so we omit the closing `"`.
-                    &state.input[1..(parsed_chars - 1)]
+                    let string_bytes = &state.bytes[1..(parsed_chars - 1)];
+
+                    match parse_utf8(string_bytes) {
+                        Ok(string) => (string, state.advance_without_indenting(parsed_chars)?),
+                        Err(reason) => {
+                            return state.fail(reason);
+                        }
+                    }
                };

-                let next_state = state.advance_without_indenting(parsed_chars)?;
-
-                return Ok((StringLiteral::Line(string), next_state));
-            } else if ch == '\n' {
+                return Ok((StringLiteral::Line(string), state));
+            } else if byte == b'\n' {
                // This is a single-line string, which cannot have newlines!
                // Treat this as an unclosed string literal, and consume
                // all remaining chars. This will mask all other errors, but
                // it should make it easiest to debug; the file will be a giant
                // error starting from where the open quote appeared.
                return Err(unexpected(
-                    '\n',
-                    state.input.len() - 1,
+                    state.bytes.len() - 1,
                    state,
                    Attempting::StringLiteral,
                ));
            } else {
-                prev_ch = ch;
+                prev_byte = byte;
            }
        }

@ -88,48 +93,64 @@ pub fn parse<'a>() -> impl Parser<'a, StringLiteral<'a>> {
 fn parse_block_string<'a, I>(
    arena: &'a Bump,
    state: State<'a>,
-    chars: &mut I,
+    bytes: &mut I,
 ) -> ParseResult<'a, StringLiteral<'a>>
 where
-    I: Iterator<Item = char>,
+    I: Iterator<Item = &'a u8>,
 {
    // So far we have consumed the `"""` and that's it.
    let mut parsed_chars = 3;
-    let mut prev_ch = '"';
+    let mut prev_byte = b'"';
    let mut quotes_seen = 0;

    // start at 3 to omit the opening `"`.
    let mut line_start = 3;

-    let mut lines = Vec::new_in(arena);
+    let mut lines: Vec<'a, &'a str> = Vec::new_in(arena);

-    for ch in chars {
+    for byte in bytes {
        parsed_chars += 1;

        // Potentially end the string (unless this is an escaped `"`!)
-        if ch == '"' && prev_ch != '\\' {
+        if *byte == b'"' && prev_byte != b'\\' {
            if quotes_seen == 2 {
                // three consecutive quotes, end string

                // Subtract 3 from parsed_chars so we omit the closing `"`.
-                let string = &state.input[line_start..(parsed_chars - 3)];
-                lines.push(string);
+                let line_bytes = &state.bytes[line_start..(parsed_chars - 3)];

-                let next_state = state.advance_without_indenting(parsed_chars)?;
+                return match parse_utf8(line_bytes) {
+                    Ok(line) => {
+                        let state = state.advance_without_indenting(parsed_chars)?;

-                return Ok((StringLiteral::Block(arena.alloc(lines)), next_state));
+                        lines.push(line);
+
+                        Ok((StringLiteral::Block(arena.alloc(lines)), state))
+                    }
+                    Err(reason) => state.fail(reason),
+                };
            }
            quotes_seen += 1;
-        } else if ch == '\n' {
+        } else if *byte == b'\n' {
            // note this includes the newline
-            let string = &state.input[line_start..parsed_chars];
-            lines.push(string);
-            quotes_seen = 0;
-            line_start = parsed_chars;
+            let line_bytes = &state.bytes[line_start..parsed_chars];
+
+            match parse_utf8(line_bytes) {
+                Ok(line) => {
+                    lines.push(line);
+
+                    quotes_seen = 0;
+                    line_start = parsed_chars;
+                }
+                Err(reason) => {
+                    return state.fail(reason);
+                }
+            }
        } else {
            quotes_seen = 0;
        }
-        prev_ch = ch;
+
+        prev_byte = *byte;
    }

    // We ran out of characters before finding 3 closing quotes
@ -137,6 +158,6 @@ where
        parsed_chars,
        // TODO custom BlockStringLiteral?
        Attempting::StringLiteral,
-        state.clone(),
+        state,
    ))
 }
--- a/compiler/parse/src/type_annotation.rs
+++ b/compiler/parse/src/type_annotation.rs
@ -4,8 +4,8 @@ use crate::expr::{global_tag, private_tag};
 use crate::ident::join_module_parts;
 use crate::keyword;
 use crate::parser::{
-    allocated, char, not, optional, string, unexpected, unexpected_eof, Either, ParseResult,
-    Parser, State,
+    allocated, ascii_char, ascii_string, not, optional, peek_utf8_char, unexpected, Either,
+    ParseResult, Parser, State,
 };
 use bumpalo::collections::string::String;
 use bumpalo::collections::vec::Vec;
@ -22,10 +22,10 @@ macro_rules! tag_union {
        map!(
            and!(
                collection!(
-                    char('['),
+                    ascii_char('['),
                    loc!(tag_type($min_indent)),
-                    char(','),
-                    char(']'),
+                    ascii_char(','),
+                    ascii_char(']'),
                    $min_indent
                ),
                optional(
@ -61,7 +61,7 @@ pub fn term<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>>>
                and!(
                    space1(min_indent),
                    skip_first!(
-                        string(keyword::AS),
+                        ascii_string(keyword::AS),
                        space1_before(term(min_indent), min_indent)
                    )
                )
@ -89,7 +89,7 @@ pub fn term<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>>>

 /// The `*` type variable, e.g. in (List *) Wildcard,
 fn loc_wildcard<'a>() -> impl Parser<'a, Located<TypeAnnotation<'a>>> {
-    map!(loc!(char('*')), |loc_val: Located<()>| {
+    map!(loc!(ascii_char('*')), |loc_val: Located<()>| {
        loc_val.map(|_| TypeAnnotation::Wildcard)
    })
 }
@ -97,7 +97,7 @@ fn loc_wildcard<'a>() -> impl Parser<'a, Located<TypeAnnotation<'a>>> {
 pub fn loc_applied_arg<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>>> {
    skip_first!(
        // Once we hit an "as", stop parsing args
-        not(string(keyword::AS)),
+        not(ascii_string(keyword::AS)),
        one_of!(
            loc_wildcard(),
            loc_parenthetical_type(min_indent),
@ -112,12 +112,12 @@ pub fn loc_applied_arg<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnot
 #[inline(always)]
 fn loc_parenthetical_type<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>>> {
    between!(
-        char('('),
+        ascii_char('('),
        space0_around(
            move |arena, state| expression(min_indent).parse(arena, state),
            min_indent,
        ),
-        char(')')
+        ascii_char(')')
    )
 }

@ -208,7 +208,7 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>
    move |arena, state: State<'a>| {
        let (first, state) = space0_before(term(min_indent), min_indent).parse(arena, state)?;
        let (rest, state) = zero_or_more!(skip_first!(
-            char(','),
+            ascii_char(','),
            space0_around(term(min_indent), min_indent)
        ))
        .parse(arena, state)?;
@ -216,7 +216,7 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>
        // TODO this space0 is dropped, so newlines just before the function arrow when there
        // is only one argument are not seen by the formatter. Can we do better?
        let (is_function, state) =
-            optional(skip_first!(space0(min_indent), string("->"))).parse(arena, state)?;
+            optional(skip_first!(space0(min_indent), ascii_string("->"))).parse(arena, state)?;

        if is_function.is_some() {
            let (return_type, state) =
@ -263,67 +263,70 @@ fn expression<'a>(min_indent: u16) -> impl Parser<'a, Located<TypeAnnotation<'a>

 fn parse_concrete_type<'a>(
    arena: &'a Bump,
-    state: State<'a>,
+    mut state: State<'a>,
 ) -> ParseResult<'a, TypeAnnotation<'a>> {
-    let mut chars = state.input.chars();
    let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
    let mut parts: Vec<&'a str> = Vec::new_in(arena);

    // Qualified types must start with a capitalized letter.
-    match chars.next() {
-        Some(ch) => {
-            if ch.is_alphabetic() && ch.is_uppercase() {
-                part_buf.push(ch);
+    match peek_utf8_char(&state) {
+        Ok((first_letter, bytes_parsed)) => {
+            if first_letter.is_alphabetic() && first_letter.is_uppercase() {
+                part_buf.push(first_letter);
            } else {
-                return Err(unexpected(ch, 0, state, Attempting::ConcreteType));
+                return Err(unexpected(0, state, Attempting::ConcreteType));
            }
-        }
-        None => {
-            return Err(unexpected_eof(0, Attempting::ConcreteType, state));
-        }
-    };

-    let mut chars_parsed = 1;
+            state = state.advance_without_indenting(bytes_parsed)?;
+        }
+        Err(reason) => return state.fail(reason),
+    }
+
    let mut next_char = None;

-    while let Some(ch) = chars.next() {
-        // After the first character, only these are allowed:
-        //
-        // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
-        // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
-        // * A dot ('.')
-        if ch.is_alphabetic() {
-            if part_buf.is_empty() && !ch.is_uppercase() {
-                // Each part must begin with a capital letter.
-                return malformed(Some(ch), arena, state, &mut chars, parts);
+    while !state.bytes.is_empty() {
+        match peek_utf8_char(&state) {
+            Ok((ch, bytes_parsed)) => {
+                // After the first character, only these are allowed:
+                //
+                // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
+                // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                // * A dot ('.')
+                if ch.is_alphabetic() {
+                    if part_buf.is_empty() && !ch.is_uppercase() {
+                        // Each part must begin with a capital letter.
+                        return malformed(Some(ch), arena, state, parts);
+                    }
+
+                    part_buf.push(ch);
+                } else if ch.is_ascii_digit() {
+                    // Parts may not start with numbers!
+                    if part_buf.is_empty() {
+                        return malformed(Some(ch), arena, state, parts);
+                    }
+
+                    part_buf.push(ch);
+                } else if ch == '.' {
+                    // Having two consecutive dots is an error.
+                    if part_buf.is_empty() {
+                        return malformed(Some(ch), arena, state, parts);
+                    }
+
+                    parts.push(part_buf.into_bump_str());
+
+                    // Now that we've recorded the contents of the current buffer, reset it.
+                    part_buf = String::new_in(arena);
+                } else {
+                    // This must be the end of the type. We're done!
+                    next_char = Some(ch);
+
+                    break;
+                }
+
+                state = state.advance_without_indenting(bytes_parsed)?;
            }
-
-            part_buf.push(ch);
-        } else if ch.is_ascii_digit() {
-            // Parts may not start with numbers!
-            if part_buf.is_empty() {
-                return malformed(Some(ch), arena, state, &mut chars, parts);
-            }
-
-            part_buf.push(ch);
-        } else if ch == '.' {
-            // Having two consecutive dots is an error.
-            if part_buf.is_empty() {
-                return malformed(Some(ch), arena, state, &mut chars, parts);
-            }
-
-            parts.push(part_buf.into_bump_str());
-
-            // Now that we've recorded the contents of the current buffer, reset it.
-            part_buf = String::new_in(arena);
-        } else {
-            // This must be the end of the type. We're done!
-            next_char = Some(ch);
-
-            break;
+            Err(reason) => return state.fail(reason),
        }
-
-        chars_parsed += 1;
    }

    if part_buf.is_empty() {
@ -333,23 +336,16 @@ fn parse_concrete_type<'a>(
        //
        // If we made it this far and don't have a next_char, then necessarily
        // we have consumed a '.' char previously.
-        return malformed(
-            next_char.or_else(|| Some('.')),
-            arena,
-            state,
-            &mut chars,
-            parts,
-        );
+        return malformed(next_char.or_else(|| Some('.')), arena, state, parts);
    }

    if part_buf.is_empty() {
        // We had neither capitalized nor noncapitalized parts,
        // yet we made it this far. The only explanation is that this was
        // a stray '.' drifting through the cosmos.
-        return Err(unexpected('.', 1, state, Attempting::Identifier));
+        return Err(unexpected(1, state, Attempting::Identifier));
    }

-    let state = state.advance_without_indenting(chars_parsed)?;
    let answer = TypeAnnotation::Apply(
        join_module_parts(arena, parts.into_bump_slice()),
        part_buf.into_bump_str(),
@ -361,58 +357,55 @@ fn parse_concrete_type<'a>(

 fn parse_type_variable<'a>(
    arena: &'a Bump,
-    state: State<'a>,
+    mut state: State<'a>,
 ) -> ParseResult<'a, TypeAnnotation<'a>> {
-    let mut chars = state.input.chars();
    let mut buf = String::new_in(arena);

-    // Type variables must start with a lowercase letter.
-    match chars.next() {
-        Some(ch) => {
-            if ch.is_alphabetic() && ch.is_lowercase() {
-                buf.push(ch);
+    match peek_utf8_char(&state) {
+        Ok((first_letter, bytes_parsed)) => {
+            // Type variables must start with a lowercase letter.
+            if first_letter.is_alphabetic() && first_letter.is_lowercase() {
+                buf.push(first_letter);
            } else {
-                return Err(unexpected(ch, 0, state, Attempting::TypeVariable));
+                return Err(unexpected(0, state, Attempting::TypeVariable));
            }
-        }
-        None => {
-            return Err(unexpected_eof(0, Attempting::TypeVariable, state));
-        }
-    };

-    let mut chars_parsed = 1;
-
-    for ch in chars {
-        // After the first character, only these are allowed:
-        //
-        // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
-        // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
-        if ch.is_alphabetic() || ch.is_ascii_digit() {
-            buf.push(ch);
-        } else {
-            // This must be the end of the type. We're done!
-            break;
+            state = state.advance_without_indenting(bytes_parsed)?;
+        }
+        Err(reason) => return state.fail(reason),
+    }
+
+    while !state.bytes.is_empty() {
+        match peek_utf8_char(&state) {
+            Ok((ch, bytes_parsed)) => {
+                // After the first character, only these are allowed:
+                //
+                // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
+                // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
+                if ch.is_alphabetic() || ch.is_ascii_digit() {
+                    buf.push(ch);
+                } else {
+                    // This must be the end of the type. We're done!
+                    break;
+                }
+
+                state = state.advance_without_indenting(bytes_parsed)?;
+            }
+            Err(reason) => return state.fail(reason),
        }
-
-        chars_parsed += 1;
    }

-    let state = state.advance_without_indenting(chars_parsed)?;
    let answer = TypeAnnotation::BoundVariable(buf.into_bump_str());

    Ok((answer, state))
 }

-fn malformed<'a, I>(
+fn malformed<'a>(
    opt_bad_char: Option<char>,
    arena: &'a Bump,
-    state: State<'a>,
-    chars: &mut I,
+    mut state: State<'a>,
    parts: Vec<&'a str>,
-) -> ParseResult<'a, TypeAnnotation<'a>>
-where
-    I: Iterator<Item = char>,
-{
+) -> ParseResult<'a, TypeAnnotation<'a>> {
    // Reconstruct the original string that we've been parsing.
    let mut full_string = String::new_in(arena);

@ -423,20 +416,25 @@ where
    }

    // Consume the remaining chars in the identifier.
-    for ch in chars {
-        // We can't use ch.is_alphanumeric() here because that passes for
-        // things that are "numeric" but not ASCII digits, like `¾`
-        if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
-            full_string.push(ch);
-        } else {
-            break;
+    while !state.bytes.is_empty() {
+        match peek_utf8_char(&state) {
+            Ok((ch, bytes_parsed)) => {
+                // We can't use ch.is_alphanumeric() here because that passes for
+                // things that are "numeric" but not ASCII digits, like `¾`
+                if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
+                    full_string.push(ch);
+                } else {
+                    break;
+                }
+
+                state = state.advance_without_indenting(bytes_parsed)?;
+            }
+            Err(reason) => return state.fail(reason),
        }
    }

-    let chars_parsed = full_string.len();
-
    Ok((
        TypeAnnotation::Malformed(full_string.into_bump_str()),
-        state.advance_without_indenting(chars_parsed)?,
+        state,
    ))
 }
--- a/compiler/parse/tests/helpers/mod.rs
+++ b/compiler/parse/tests/helpers/mod.rs
@ -13,7 +13,7 @@ pub fn parse_with<'a>(arena: &'a Bump, input: &'a str) -> Result<ast::Expr<'a>,

 #[allow(dead_code)]
 pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+    let state = State::new(input.as_bytes(), Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);

--- a/compiler/parse/tests/test_parse.rs
+++ b/compiler/parse/tests/test_parse.rs
@ -918,17 +918,12 @@ mod test_parse {
        let arena = Bump::new();
        let arg = arena.alloc(Located::new(0, 0, 5, 6, Num("1")));
        let args = bumpalo::vec![in &arena; &*arg];
+        let expr = Var {
+            module_name: "",
+            ident: "whee",
+        };
        let expected = Expr::Apply(
-            arena.alloc(Located::new(
-                0,
-                0,
-                0,
-                4,
-                Var {
-                    module_name: "",
-                    ident: "whee",
-                },
-            )),
+            arena.alloc(Located::new(0, 0, 0, 4, expr)),
            args,
            CalledVia::Space,
        );
@ -1040,16 +1035,11 @@ mod test_parse {
    fn unary_negation() {
        let arena = Bump::new();
        let loc_op = Located::new(0, 0, 0, 1, UnaryOp::Negate);
-        let loc_arg1_expr = Located::new(
-            0,
-            0,
-            1,
-            4,
-            Var {
-                module_name: "",
-                ident: "foo",
-            },
-        );
+        let arg1_expr = Var {
+            module_name: "",
+            ident: "foo",
+        };
+        let loc_arg1_expr = Located::new(0, 0, 1, 4, arg1_expr);
        let expected = UnaryOp(arena.alloc(loc_arg1_expr), loc_op);
        let actual = parse_with(&arena, "-foo");

@ -1060,16 +1050,11 @@ mod test_parse {
    fn unary_not() {
        let arena = Bump::new();
        let loc_op = Located::new(0, 0, 0, 1, UnaryOp::Not);
-        let loc_arg1_expr = Located::new(
-            0,
-            0,
-            1,
-            5,
-            Var {
-                module_name: "",
-                ident: "blah",
-            },
-        );
+        let arg1_expr = Var {
+            module_name: "",
+            ident: "blah",
+        };
+        let loc_arg1_expr = Located::new(0, 0, 1, 5, arg1_expr);
        let expected = UnaryOp(arena.alloc(loc_arg1_expr), loc_op);
        let actual = parse_with(&arena, "!blah");

@ -2092,7 +2077,7 @@ mod test_parse {
            "#
        );
        let actual = interface_header()
-            .parse(&arena, State::new(&src, Attempting::Module))
+            .parse(&arena, State::new(src.as_bytes(), Attempting::Module))
            .map(|tuple| tuple.0);

        assert_eq!(Ok(expected), actual);
@ -2121,7 +2106,7 @@ mod test_parse {
            "#
        );
        let actual = interface_header()
-            .parse(&arena, State::new(&src, Attempting::Module))
+            .parse(&arena, State::new(src.as_bytes(), Attempting::Module))
            .map(|tuple| tuple.0);

        assert_eq!(Ok(expected), actual);
@ -2174,7 +2159,7 @@ mod test_parse {
            "#
        );
        let actual = module_defs()
-            .parse(&arena, State::new(&src, Attempting::Module))
+            .parse(&arena, State::new(src.as_bytes(), Attempting::Module))
            .map(|tuple| tuple.0);

        assert_eq!(Ok(expected), actual);
--- a/compiler/reporting/tests/helpers/mod.rs
+++ b/compiler/reporting/tests/helpers/mod.rs
@ -91,7 +91,7 @@ pub fn parse_with<'a>(arena: &'a Bump, input: &'a str) -> Result<ast::Expr<'a>,

 #[allow(dead_code)]
 pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+    let state = State::new(input.as_bytes(), Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);

--- a/compiler/solve/tests/helpers/mod.rs
+++ b/compiler/solve/tests/helpers/mod.rs
@ -93,7 +93,7 @@ pub fn parse_with<'a>(arena: &'a Bump, input: &'a str) -> Result<ast::Expr<'a>,

 #[allow(dead_code)]
 pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+    let state = State::new(input.as_bytes(), Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);

--- a/compiler/uniq/tests/helpers/mod.rs
+++ b/compiler/uniq/tests/helpers/mod.rs
@ -93,7 +93,7 @@ pub fn parse_with<'a>(arena: &'a Bump, input: &'a str) -> Result<ast::Expr<'a>,

 #[allow(dead_code)]
 pub fn parse_loc_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Located<ast::Expr<'a>>, Fail> {
-    let state = State::new(&input, Attempting::Module);
+    let state = State::new(input.as_bytes(), Attempting::Module);
    let parser = space0_before(loc(roc_parse::expr::expr(0)), 0);
    let answer = parser.parse(&arena, state);