From 5576b88604f6519a60b23317783538a308c0b6d8 Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Fri, 19 Apr 2024 12:25:41 +0900 Subject: [PATCH] Upgrade blake3 in ./third-party to v1.5.1 --- third-party/blake3/.git-blame-ignore-revs | 2 + third-party/blake3/.github/workflows/ci.yml | 77 +- third-party/blake3/.github/workflows/tag.yml | 8 +- third-party/blake3/Cargo.toml | 37 +- third-party/blake3/README.md | 2 + third-party/blake3/b3sum/Cargo.lock | 482 +++----- third-party/blake3/b3sum/Cargo.toml | 5 +- third-party/blake3/b3sum/src/main.rs | 162 +-- third-party/blake3/build.rs | 22 +- third-party/blake3/c/CMakeLists.txt | 67 +- third-party/blake3/c/blake3.c | 19 +- third-party/blake3/c/blake3.h | 2 +- .../c/blake3_c_rust_bindings/Cargo.toml | 6 +- .../c/blake3_c_rust_bindings/src/test.rs | 2 +- third-party/blake3/c/blake3_dispatch.c | 39 +- third-party/blake3/c/blake3_impl.h | 6 +- third-party/blake3/c/blake3_neon.c | 6 +- third-party/blake3/rust/guts/Cargo.toml | 18 + third-party/blake3/rust/guts/readme.md | 80 ++ third-party/blake3/rust/guts/src/lib.rs | 1000 +++++++++++++++++ third-party/blake3/rust/guts/src/portable.rs | 262 +++++ third-party/blake3/rust/guts/src/test.rs | 523 +++++++++ third-party/blake3/src/io.rs | 79 ++ third-party/blake3/src/lib.rs | 304 ++++- third-party/blake3/src/platform.rs | 24 + third-party/blake3/src/test.rs | 208 ++++ third-party/blake3/tools/release.md | 2 +- 27 files changed, 2855 insertions(+), 589 deletions(-) create mode 100644 third-party/blake3/.git-blame-ignore-revs create mode 100644 third-party/blake3/rust/guts/Cargo.toml create mode 100644 third-party/blake3/rust/guts/readme.md create mode 100644 third-party/blake3/rust/guts/src/lib.rs create mode 100644 third-party/blake3/rust/guts/src/portable.rs create mode 100644 third-party/blake3/rust/guts/src/test.rs create mode 100644 third-party/blake3/src/io.rs diff --git a/third-party/blake3/.git-blame-ignore-revs b/third-party/blake3/.git-blame-ignore-revs new file mode 100644 
index 00000000..6e814e69 --- /dev/null +++ b/third-party/blake3/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# CMakeLists.txt whitespace fixups +3e14f865d30271c74fc68d417af488ea91b66d48 diff --git a/third-party/blake3/.github/workflows/ci.yml b/third-party/blake3/.github/workflows/ci.yml index c1a88aaf..e93ecb38 100644 --- a/third-party/blake3/.github/workflows/ci.yml +++ b/third-party/blake3/.github/workflows/ci.yml @@ -38,12 +38,10 @@ jobs: ] steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true # Print the compiler version, for debugging. - name: print compiler version run: cargo run --quiet @@ -52,13 +50,17 @@ jobs: - name: print instruction set support run: cargo run --quiet working-directory: ./tools/instruction_set_support - # Default tests plus Rayon and RustCrypto trait implementations. - - run: cargo test --features=rayon,traits-preview + # Default tests plus Rayon and trait implementations. + - run: cargo test --features=rayon,traits-preview,serde,zeroize # Same but with only one thread in the Rayon pool. This can find deadlocks. - name: "again with RAYON_NUM_THREADS=1" - run: cargo test --features=rayon,traits-preview + run: cargo test --features=rayon,traits-preview,serde,zeroize env: RAYON_NUM_THREADS: 1 + # The mmap feature by itself (update_mmap_rayon is omitted). + - run: cargo test --features=mmap + # All public features put together. + - run: cargo test --features=mmap,rayon,traits-preview,serde,zeroize # no_std tests. 
- run: cargo test --no-default-features @@ -129,6 +131,17 @@ jobs: run: cargo test working-directory: ./reference_impl + # the new guts crate + - name: guts test + run: cargo test --all-features + working-directory: ./rust/guts + - name: guts no_std build + run: cargo build --no-default-features + working-directory: ./rust/guts + - name: guts no_std test # note that rust/guts/src/test.rs still uses libstd + run: cargo test --no-default-features + working-directory: ./rust/guts + b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} @@ -148,16 +161,14 @@ jobs: # The b3sum MSRV is sometimes higher than the blake3 crate's, because # b3sum depends on Clap. We check in the b3sum Cargo.lock, so Clap # update shouldn't randomly break us here. - "1.66.1", + "1.74.1", ] steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true # Test b3sum. - name: test b3sum run: cargo test @@ -177,14 +188,13 @@ jobs: - i686-unknown-linux-musl - armv7-unknown-linux-gnueabihf - aarch64-unknown-linux-gnu - - mips-unknown-linux-gnu + # Big-endian targets. See https://twitter.com/burntsushi5/status/1695483429997945092. + - powerpc64-unknown-linux-gnu + - s390x-unknown-linux-gnu steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable - run: cargo install cross # Test the portable implementation on everything. - run: cross test --target ${{ matrix.arch }} @@ -210,7 +220,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Test the intrinsics-based implementations. 
- run: make -f Makefile.testing test working-directory: ./c @@ -262,12 +272,10 @@ jobs: strategy: fail-fast: false steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable with: - toolchain: stable - target: aarch64-apple-darwin - override: true + targets: aarch64-apple-darwin - name: build blake3 run: cargo build --target aarch64-apple-darwin - name: build b3sum @@ -278,7 +286,7 @@ jobs: name: build with the Tiny C Compiler runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: install TCC run: sudo apt-get install -y tcc - name: compile @@ -295,7 +303,7 @@ jobs: name: "compile and test with GCC 5.4" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: addnab/docker-run-action@v3 with: image: gcc:5.4 @@ -308,7 +316,7 @@ jobs: # CMake build test (Library only), current macOS/Linux only. cmake_build: - name: CMake ${{ matrix.os }} + name: CMake ${{ matrix.os }} ${{ matrix.compiler }} runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -323,8 +331,21 @@ jobs: - os: macOS-latest compiler: msvc steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: CMake generation run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target - name: CMake build / install run: cmake --build c/build --target install + + miri_smoketest: + name: Miri smoketest + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + with: + components: miri + # Currently the test search "miri" only matches "test_miri_smoketest", but + # we might add more. If this accidentally picks up anything incompatible or + # slow, we can narrow it. 
+ - run: cargo miri test miri diff --git a/third-party/blake3/.github/workflows/tag.yml b/third-party/blake3/.github/workflows/tag.yml index 3f7e886b..61be4ff9 100644 --- a/third-party/blake3/.github/workflows/tag.yml +++ b/third-party/blake3/.github/workflows/tag.yml @@ -23,18 +23,16 @@ jobs: ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.x" - run: pip install PyGithub - run: sudo apt-get install musl-tools if: matrix.target.os == 'ubuntu-latest' - - uses: actions-rs/toolchain@v1 + - uses: dtolnay/rust-toolchain@stable with: - toolchain: stable - profile: minimal - - run: rustup target add ${{ matrix.target.rust-target }} + targets: ${{ matrix.target.rust-target }} - name: build b3sum id: build_b3sum run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} diff --git a/third-party/blake3/Cargo.toml b/third-party/blake3/Cargo.toml index 8df13874..55eb8a41 100644 --- a/third-party/blake3/Cargo.toml +++ b/third-party/blake3/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "blake3" -version = "1.4.1" +version = "1.5.1" authors = ["Jack O'Connor ", "Samuel Neves"] description = "the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" @@ -23,11 +23,21 @@ neon = [] # --no-default-features, the only way to use the SIMD implementations in this # crate is to enable the corresponding instruction sets statically for the # entire build, with e.g. RUSTFLAGS="-C target-cpu=native". -std = ["digest/std"] +std = [] -# The "rayon" feature (defined below as an optional dependency) enables the -# `Hasher::update_rayon` method, for multithreaded hashing. However, even if -# this feature is enabled, all other APIs remain single-threaded. +# The `rayon` feature (disabled by default, but enabled for docs.rs) adds the +# `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon` +# methods, for multithreaded hashing. 
However, even if this feature is enabled, +# all other APIs remain single-threaded. +rayon = ["dep:rayon", "std"] + +# The `mmap` feature (disabled by default, but enabled for docs.rs) adds the +# `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon` +# helper methods for memory-mapped IO. +mmap = ["std", "dep:memmap2"] + +# Implement the zeroize::Zeroize trait for types in this crate. +zeroize = ["dep:zeroize", "arrayvec/zeroize"] # This crate implements traits from the RustCrypto project, exposed here as the # "traits-preview" feature. However, these traits aren't stable, and they're @@ -78,24 +88,29 @@ no_avx512 = [] no_neon = [] [package.metadata.docs.rs] -# Document Hasher::update_rayon on docs.rs. -features = ["rayon"] +# Document the rayon/mmap methods and the Serialize/Deserialize/Zeroize impls on docs.rs. +features = ["mmap", "rayon", "serde", "zeroize"] [dependencies] arrayref = "0.3.5" -arrayvec = { version = "0.7.0", default-features = false } +arrayvec = { version = "0.7.4", default-features = false } constant_time_eq = "0.3.0" -rayon = { version = "1.2.1", optional = true } cfg-if = "1.0.0" digest = { version = "0.10.1", features = [ "mac" ], optional = true } +memmap2 = { version = "0.9", optional = true } +rayon = { version = "1.2.1", optional = true } +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true } [dev-dependencies] +hmac = "0.12.0" hex = "0.4.2" -page_size = "0.5.0" +page_size = "0.6.0" rand = "0.8.0" rand_chacha = "0.3.0" reference_impl = { path = "./reference_impl" } -hmac = "0.12.0" +tempfile = "3.8.0" +serde_json = "1.0.107" [build-dependencies] cc = "1.0.4" diff --git a/third-party/blake3/README.md b/third-party/blake3/README.md index a63d5f2c..6b493775 100644 --- a/third-party/blake3/README.md +++ b/third-party/blake3/README.md @@ -201,6 +201,7 @@ Alternatively, it is 
licensed under the Apache License 2.0. Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala) +* [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0) * [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16) * [IPFS](https://github.com/ipfs/go-verifcid/issues/13) * [Farcaster](https://www.farcaster.xyz/) @@ -211,6 +212,7 @@ Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Saito](https://saito.tech/) * [Skale](https://github.com/skalenetwork/skale-consensus/pull/284) * [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html) +* [Tekken 8](https://en.bandainamcoent.eu/tekken/tekken-8) * [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21) diff --git a/third-party/blake3/b3sum/Cargo.lock b/third-party/blake3/b3sum/Cargo.lock index 2a599a85..2300d3bf 100644 --- a/third-party/blake3/b3sum/Cargo.lock +++ b/third-party/blake3/b3sum/Cargo.lock @@ -4,58 +4,57 @@ version = 3 [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.1" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "anstyle-parse" -version = "0.2.1" +version = "0.2.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anyhow" -version = "1.0.71" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" [[package]] name = "arrayref" @@ -69,22 +68,15 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - [[package]] name = "b3sum" -version = "1.4.1" +version = "1.5.1" dependencies = [ "anyhow", "blake3", "clap", "duct", "hex", - "memmap2", "rayon", "tempfile", "wild", @@ -92,43 +84,28 @@ dependencies = [ [[package]] name = "bitflags" -version = "1.3.2" +version = "2.4.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "blake3" -version = "1.4.1" +version = "1.5.1" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", - "digest", + "memmap2", "rayon", ] -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "cc" -version = "1.0.79" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" [[package]] name = "cfg-if" @@ -138,20 +115,19 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.3.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" +checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ "anstream", "anstyle", @@ -162,9 +138,9 @@ dependencies = [ [[package]] name = 
"clap_derive" -version = "4.3.2" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", "proc-macro2", @@ -174,9 +150,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "colorchoice" @@ -190,75 +166,36 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" -[[package]] -name = "crossbeam-channel" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "duct" -version = "0.13.6" +version = "0.13.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ae3fc31835f74c2a7ceda3aeede378b0ae2e74c8f1c36559fcc9ae2a4e7d3e" +checksum = "e4ab5718d1224b63252cd0c6f74f6480f9ffeb117438a2e0f5cf6d9a4798929c" dependencies = [ "libc", "once_cell", @@ -268,49 +205,25 @@ dependencies = [ [[package]] name = "either" -version = "1.8.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" 
-dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] name = "fastrand" -version = "1.9.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "glob" @@ -324,134 +237,72 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" -[[package]] -name = "hermit-abi" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" - [[package]] name = "hex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys", -] - -[[package]] -name = "is-terminal" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" -dependencies = [ - "hermit-abi", - "rustix 0.38.3", - 
"windows-sys", -] - [[package]] name = "libc" -version = "0.2.147" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" -version = "0.3.8" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" - -[[package]] -name = "linux-raw-sys" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "memmap2" -version = "0.7.1" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" dependencies = [ "libc", ] -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "os_pipe" -version = "1.1.4" +version = "1.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ae859aa07428ca9a929b936690f8b12dc5f11dd8c6992a18ca93919f28bc177" +checksum = "57119c3b893986491ec9aa85056780d3a0f3cf4da7cc09dd3650dbd6c6738fb9" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "proc-macro2" -version = "1.0.63" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.29" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" dependencies = [ "either", "rayon-core", @@ -459,58 +310,27 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ - "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "num_cpus", -] - -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", ] [[package]] name = "rustix" -version = "0.37.23" +version = 
"0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.8", - "windows-sys", -] - -[[package]] -name = "rustix" -version = "0.38.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4" -dependencies = [ - "bitflags 2.3.3", + "bitflags", "errno", "libc", - "linux-raw-sys 0.4.3", - "windows-sys", + "linux-raw-sys", + "windows-sys 0.52.0", ] -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - [[package]] name = "shared_child" version = "1.0.0" @@ -523,21 +343,15 @@ dependencies = [ [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "subtle" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" [[package]] name = "syn" -version = "2.0.23" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -546,39 +360,31 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.6.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ - "autocfg", "cfg-if", "fastrand", - "redox_syscall", - "rustix 0.37.23", - "windows-sys", + "rustix", + "windows-sys 0.52.0", ] [[package]] name = "terminal_size" -version = "0.2.6" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ - "rustix 0.37.23", - "windows-sys", + "rustix", + "windows-sys 0.48.0", ] -[[package]] -name = "typenum" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" - [[package]] name = "unicode-ident" -version = "1.0.10" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "utf8parse" @@ -586,17 +392,11 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - [[package]] name = "wild" -version = "2.1.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b116685a6be0c52f5a103334cbff26db643826c7b3735fc0a3ba9871310a74" +checksum = "a3131afc8c575281e1e80f36ed6a092aa502c08b18ed7524e86fbbb12bb410e1" dependencies = [ "glob", ] @@ -629,62 +429,128 @@ version = "0.48.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", ] [[package]] name = "windows-targets" -version = "0.48.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = 
"0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + 
+[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/third-party/blake3/b3sum/Cargo.toml b/third-party/blake3/b3sum/Cargo.toml index 02c9405f..812ed224 100644 --- a/third-party/blake3/b3sum/Cargo.toml +++ b/third-party/blake3/b3sum/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "b3sum" -version = "1.4.1" +version = "1.5.1" authors = ["Jack O'Connor "] description = "a command line implementation of the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" @@ -15,10 +15,9 @@ pure = ["blake3/pure"] [dependencies] anyhow = "1.0.25" -blake3 = { version = "1", path = "..", features = ["rayon"] } +blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] } clap = { version = "4.0.8", features = ["derive", "wrap_help"] } hex = "0.4.0" -memmap2 = "0.7.0" rayon = 
"1.2.1" wild = "2.0.3" diff --git a/third-party/blake3/b3sum/src/main.rs b/third-party/blake3/b3sum/src/main.rs index fd35f686..228737ff 100644 --- a/third-party/blake3/b3sum/src/main.rs +++ b/third-party/blake3/b3sum/src/main.rs @@ -163,125 +163,22 @@ impl Args { } } -enum Input { - Mmap(io::Cursor), - File(File), - Stdin, -} - -impl Input { - // Open an input file, using mmap if appropriate. "-" means stdin. Note - // that this convention applies both to command line arguments, and to - // filepaths that appear in a checkfile. - fn open(path: &Path, args: &Args) -> Result { - if path == Path::new("-") { - if args.keyed() { - bail!("Cannot open `-` in keyed mode"); - } - return Ok(Self::Stdin); +fn hash_path(args: &Args, path: &Path) -> Result { + let mut hasher = args.base_hasher.clone(); + if path == Path::new("-") { + if args.keyed() { + bail!("Cannot open `-` in keyed mode"); } - let file = File::open(path)?; - if !args.no_mmap() { - if let Some(mmap) = maybe_memmap_file(&file)? { - return Ok(Self::Mmap(io::Cursor::new(mmap))); - } - } - Ok(Self::File(file)) - } - - fn hash(&mut self, args: &Args) -> Result { - let mut hasher = args.base_hasher.clone(); - match self { - // The fast path: If we mmapped the file successfully, hash using - // multiple threads. This doesn't work on stdin, or on some files, - // and it can also be disabled with --no-mmap. - Self::Mmap(cursor) => { - hasher.update_rayon(cursor.get_ref()); - } - // The slower paths, for stdin or files we didn't/couldn't mmap. - // This is currently all single-threaded. Doing multi-threaded - // hashing without memory mapping is tricky, since all your worker - // threads have to stop every time you refill the buffer, and that - // ends up being a lot of overhead. To solve that, we need a more - // complicated double-buffering strategy where a background thread - // fills one buffer while the worker threads are hashing the other - // one. 
We might implement that in the future, but since this is - // the slow path anyway, it's not high priority. - Self::File(file) => { - copy_wide(file, &mut hasher)?; - } - Self::Stdin => { - let stdin = io::stdin(); - let lock = stdin.lock(); - copy_wide(lock, &mut hasher)?; - } - } - let mut output_reader = hasher.finalize_xof(); - output_reader.set_position(args.seek()); - Ok(output_reader) - } -} - -impl Read for Input { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Mmap(cursor) => cursor.read(buf), - Self::File(file) => file.read(buf), - Self::Stdin => io::stdin().read(buf), - } - } -} - -// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets -// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms -// can support at least 64 KiB, and there's some performance benefit to using -// bigger reads, so that's what we use here. -fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result { - let mut buffer = [0; 65536]; - let mut total = 0; - loop { - match reader.read(&mut buffer) { - Ok(0) => return Ok(total), - Ok(n) => { - hasher.update(&buffer[..n]); - total += n as u64; - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => return Err(e), - } - } -} - -// Mmap a file, if it looks like a good idea. Return None in cases where we -// know mmap will fail, or if the file is short enough that mmapping isn't -// worth it. However, if we do try to mmap and it fails, return the error. -fn maybe_memmap_file(file: &File) -> Result> { - let metadata = file.metadata()?; - let file_size = metadata.len(); - Ok(if !metadata.is_file() { - // Not a real file. - None - } else if file_size > isize::max_value() as u64 { - // Too long to safely map. - // https://github.com/danburkert/memmap-rs/issues/69 - None - } else if file_size == 0 { - // Mapping an empty file currently fails. 
- // https://github.com/danburkert/memmap-rs/issues/72 - None - } else if file_size < 16 * 1024 { - // Mapping small files is not worth it. - None + hasher.update_reader(io::stdin().lock())?; + } else if args.no_mmap() { + hasher.update_reader(File::open(path)?)?; } else { - // Explicitly set the length of the memory map, so that filesystem - // changes can't race to violate the invariants we just checked. - let map = unsafe { - memmap2::MmapOptions::new() - .len(file_size as usize) - .map(file)? - }; - Some(map) - }) + // The fast path: Try to mmap the file and hash it with multiple threads. + hasher.update_mmap_rayon(path)?; + } + let mut output_reader = hasher.finalize_xof(); + output_reader.set_position(args.seek()); + Ok(output_reader) } fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> { @@ -477,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result { } fn hash_one_input(path: &Path, args: &Args) -> Result<()> { - let mut input = Input::open(path, args)?; - let output = input.hash(args)?; + let output = hash_path(args, path)?; if args.raw() { write_raw_output(output, args)?; return Ok(()); @@ -522,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool { } else { file_string }; - let hash_result: Result = Input::open(&file_path, args) - .and_then(|mut input| input.hash(args)) - .map(|mut hash_output| { + let found_hash: blake3::Hash; + match hash_path(args, &file_path) { + Ok(mut output) => { let mut found_hash_bytes = [0; blake3::OUT_LEN]; - hash_output.fill(&mut found_hash_bytes); - found_hash_bytes.into() - }); - let found_hash: blake3::Hash = match hash_result { - Ok(hash) => hash, + output.fill(&mut found_hash_bytes); + found_hash = found_hash_bytes.into(); + } Err(e) => { println!("{}: FAILED ({})", file_string, e); return false; @@ -549,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool { } fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> { - let 
checkfile_input = Input::open(path, args)?; - let mut bufreader = io::BufReader::new(checkfile_input); + let mut file; + let stdin; + let mut stdin_lock; + let mut bufreader: io::BufReader<&mut dyn Read>; + if path == Path::new("-") { + stdin = io::stdin(); + stdin_lock = stdin.lock(); + bufreader = io::BufReader::new(&mut stdin_lock); + } else { + file = File::open(path)?; + bufreader = io::BufReader::new(&mut file); + } let mut line = String::new(); loop { line.clear(); diff --git a/third-party/blake3/build.rs b/third-party/blake3/build.rs index ac1d6a64..a5dfd062 100644 --- a/third-party/blake3/build.rs +++ b/third-party/blake3/build.rs @@ -60,6 +60,20 @@ fn is_armv7() -> bool { target_components()[0] == "armv7" } +fn endianness() -> String { + let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); + assert!(endianness == "little" || endianness == "big"); + endianness +} + +fn is_little_endian() -> bool { + endianness() == "little" +} + +fn is_big_endian() -> bool { + endianness() == "big" +} + // Windows targets may be using the MSVC toolchain or the GNU toolchain. The // right compiler flags to use depend on the toolchain. 
(And we don't want to // use flag_if_supported, because we don't want features to be silently @@ -253,7 +267,13 @@ fn main() -> Result<(), Box> { } } - if (is_arm() && is_neon()) || (!is_no_neon() && !is_pure() && is_aarch64()) { + if is_neon() && is_big_endian() { + panic!("The NEON implementation doesn't support big-endian ARM.") + } + + if (is_arm() && is_neon()) + || (!is_no_neon() && !is_pure() && is_aarch64() && is_little_endian()) + { println!("cargo:rustc-cfg=blake3_neon"); build_neon_c_intrinsics(); } diff --git a/third-party/blake3/c/CMakeLists.txt b/third-party/blake3/c/CMakeLists.txt index 3190effa..3a3b232d 100644 --- a/third-party/blake3/c/CMakeLists.txt +++ b/third-party/blake3/c/CMakeLists.txt @@ -1,15 +1,23 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.9 FATAL_ERROR) + +# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD +if (POLICY CMP0128) + cmake_policy(SET CMP0128 NEW) +endif() project(libblake3 - VERSION 1.4.1 + VERSION 1.5.1 DESCRIPTION "BLAKE3 C implementation" LANGUAGES C ASM ) -include(CheckCCompilerFlag) include(FeatureSummary) include(GNUInstallDirs) +# architecture lists for which to enable assembly / SIMD sources +set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) +set(BLAKE3_X86_NAMES i686 x86 X86) +set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # default SIMD compiler flag configuration (can be overriden by toolchains or CLI) if(MSVC) set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") @@ -25,11 +33,13 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512") + + if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + # 32-bit ARMv8 needs NEON to be enabled 
explicitly + set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON") + endif() endif() -# architecture lists for which to enable assembly / SIMD sources -set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) -set(BLAKE3_X86_NAMES i686 x86 X86) -set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # library target add_library(blake3 @@ -42,26 +52,40 @@ add_library(BLAKE3::blake3 ALIAS blake3) # library configuration set(BLAKE3_PKGCONFIG_CFLAGS) if (BUILD_SHARED_LIBS) - target_compile_definitions(blake3 + target_compile_definitions(blake3 PUBLIC BLAKE3_DLL PRIVATE BLAKE3_DLL_EXPORTS ) list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL) endif() -target_include_directories(blake3 PUBLIC $) +target_include_directories(blake3 PUBLIC + $ + $ +) set_target_properties(blake3 PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 0 C_VISIBILITY_PRESET hidden + C_EXTENSIONS OFF ) +target_compile_features(blake3 PUBLIC c_std_99) +# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD +# which may be set by the user or toolchain file +if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD) + set_target_properties(blake3 PROPERTIES C_STANDARD 99) +endif() # optional SIMD sources macro(BLAKE3_DISABLE_SIMD) set(BLAKE3_SIMD_AMD64_ASM OFF) set(BLAKE3_SIMD_X86_INTRINSICS OFF) set(BLAKE3_SIMD_NEON_INTRINSICS OFF) - set_source_files_properties(blake3_dispatch.c PROPERTIES - COMPILE_DEFINITIONS BLAKE3_USE_NEON=0;BLAKE3_NO_SSE2;BLAKE3_NO_SSE41;BLAKE3_NO_AVX2;BLAKE3_NO_AVX512 + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=0 + BLAKE3_NO_SSE2 + BLAKE3_NO_SSE41 + BLAKE3_NO_AVX2 + BLAKE3_NO_AVX512 ) endmacro() @@ -100,7 +124,7 @@ if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM) BLAKE3_DISABLE_SIMD() endif() - else() + else() BLAKE3_DISABLE_SIMD() endif() @@ -122,22 +146,19 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRIN set_source_files_properties(blake3_sse2.c PROPERTIES 
COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") -elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES - OR ((ANDROID_ABI STREQUAL "armeabi-v7a" - OR BLAKE3_USE_NEON_INTRINSICS) - AND (DEFINED BLAKE3_CFLAGS_NEON - OR CMAKE_SIZEOF_VOID_P EQUAL 8))) +elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + OR ANDROID_ABI STREQUAL "armeabi-v7a" + OR BLAKE3_USE_NEON_INTRINSICS) + AND (DEFINED BLAKE3_CFLAGS_NEON + OR CMAKE_SIZEOF_VOID_P EQUAL 8)) set(BLAKE3_SIMD_NEON_INTRINSICS ON) target_sources(blake3 PRIVATE blake3_neon.c ) - target_compile_options(blake3 PRIVATE -DBLAKE3_USE_NEON=1) - - check_c_compiler_flag(-mfpu=neon BLAKE3_MFPU_NEON_SUPPORTED) - if (BLAKE3_MFPU_NEON_SUPPORTED) - target_compile_options(blake3 PRIVATE -mfpu=neon) - endif() + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=1 + ) if (DEFINED BLAKE3_CFLAGS_NEON) set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") diff --git a/third-party/blake3/c/blake3.c b/third-party/blake3/c/blake3.c index 692f4b02..1b44c719 100644 --- a/third-party/blake3/c/blake3.c +++ b/third-party/blake3/c/blake3.c @@ -341,21 +341,24 @@ INLINE void compress_subtree_to_parent_node( size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because + // as we just asserted, num_cvs will always be <=2 in that case. But GCC + // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is + // set then it emits incorrect warnings here. We tried a few different + // hacks to silence these, but in the end our hacks just produced different + // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). 
Out of + // desperation, we ifdef out this entire loop when we know it's not needed. +#if MAX_SIMD_DEGREE_OR_2 > 2 + // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - // The second half of this loop condition is always true, and we just - // asserted it above. But GCC can't tell that it's always true, and if NDEBUG - // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious - // warnings here. GCC 8.5 is particularly sensitive, so if you're changing - // this code, test it against that version. - while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + while (num_cvs > 2) { num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); } +#endif memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } diff --git a/third-party/blake3/c/blake3.h b/third-party/blake3/c/blake3.h index 21e0d7b9..48284e50 100644 --- a/third-party/blake3/c/blake3.h +++ b/third-party/blake3/c/blake3.h @@ -30,7 +30,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.4.1" +#define BLAKE3_VERSION_STRING "1.5.1" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml b/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml index fff9f416..c1aee32e 100644 --- a/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml +++ b/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml @@ -20,9 +20,9 @@ neon = [] [dev-dependencies] arrayref = "0.3.5" arrayvec = { version = "0.7.0", default-features = false } -page_size = "0.4.1" -rand = "0.7.2" -rand_chacha = "0.2.1" +page_size = "0.6.0" +rand = "0.8.5" +rand_chacha = "0.3.1" reference_impl = { path = "../../reference_impl" } [build-dependencies] diff --git 
a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs index 1fc077c8..0730d930 100644 --- a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs +++ b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs @@ -485,7 +485,7 @@ fn test_fuzz_hasher() { let mut total_input = 0; // For each test, write 3 inputs of random length. for _ in 0..3 { - let input_len = rng.gen_range(0, INPUT_MAX + 1); + let input_len = rng.gen_range(0..INPUT_MAX + 1); dbg!(input_len); let input = &input_buf[total_input..][..input_len]; hasher.update(input); diff --git a/third-party/blake3/c/blake3_dispatch.c b/third-party/blake3/c/blake3_dispatch.c index 2ab0093e..af6c3dad 100644 --- a/third-party/blake3/c/blake3_dispatch.c +++ b/third-party/blake3/c/blake3_dispatch.c @@ -6,6 +6,7 @@ #if defined(IS_X86) #if defined(_MSC_VER) +#include #include #elif defined(__GNUC__) #include @@ -14,6 +15,32 @@ #endif #endif +#if !defined(BLAKE3_ATOMICS) +#if defined(__has_include) +#if __has_include() && !defined(_MSC_VER) +#define BLAKE3_ATOMICS 1 +#else +#define BLAKE3_ATOMICS 0 +#endif /* __has_include() && !defined(_MSC_VER) */ +#else +#define BLAKE3_ATOMICS 0 +#endif /* defined(__has_include) */ +#endif /* BLAKE3_ATOMICS */ + +#if BLAKE3_ATOMICS +#define ATOMIC_INT _Atomic int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#elif defined(_MSC_VER) +#define ATOMIC_INT LONG +#define ATOMIC_LOAD(x) InterlockedOr(&x, 0) +#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) +#else +#define ATOMIC_INT int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#endif + #define MAYBE_UNUSED(x) (void)((x)) #if defined(IS_X86) @@ -76,7 +103,7 @@ enum cpu_feature { #if !defined(BLAKE3_TESTING) static /* Allow the variable to be controlled manually for testing */ #endif - enum cpu_feature g_cpu_features = UNDEFINED; + ATOMIC_INT g_cpu_features = UNDEFINED; #if !defined(BLAKE3_TESTING) static @@ -84,14 +111,16 @@ static enum 
cpu_feature get_cpu_features(void) { - if (g_cpu_features != UNDEFINED) { - return g_cpu_features; + /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ + enum cpu_feature features = ATOMIC_LOAD(g_cpu_features); + if (features != UNDEFINED) { + return features; } else { #if defined(IS_X86) uint32_t regs[4] = {0}; uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; (void)edx; - enum cpu_feature features = 0; + features = 0; cpuid(regs, 0); const int max_id = *eax; cpuid(regs, 1); @@ -124,7 +153,7 @@ static } } } - g_cpu_features = features; + ATOMIC_STORE(g_cpu_features, features); return features; #else /* How to detect NEON? */ diff --git a/third-party/blake3/c/blake3_impl.h b/third-party/blake3/c/blake3_impl.h index 3ba9ceb0..beab5cf5 100644 --- a/third-party/blake3/c/blake3_impl.h +++ b/third-party/blake3/c/blake3_impl.h @@ -51,7 +51,11 @@ enum blake3_flags { #if !defined(BLAKE3_USE_NEON) // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness #if defined(IS_AARCH64) - #define BLAKE3_USE_NEON 1 + #if defined(__ARM_BIG_ENDIAN) + #define BLAKE3_USE_NEON 0 + #else + #define BLAKE3_USE_NEON 1 + #endif #else #define BLAKE3_USE_NEON 0 #endif diff --git a/third-party/blake3/c/blake3_neon.c b/third-party/blake3/c/blake3_neon.c index 8a818fc7..90bdd572 100644 --- a/third-party/blake3/c/blake3_neon.c +++ b/third-party/blake3/c/blake3_neon.c @@ -10,14 +10,12 @@ INLINE uint32x4_t loadu_128(const uint8_t src[16]) { // vld1q_u32 has alignment requirements. Don't use it. - uint32x4_t x; - memcpy(&x, src, 16); - return x; + return vreinterpretq_u32_u8(vld1q_u8(src)); } INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { // vst1q_u32 has alignment requirements. Don't use it. 
- memcpy(dest, &src, 16); + vst1q_u8(dest, vreinterpretq_u8_u32(src)); } INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { diff --git a/third-party/blake3/rust/guts/Cargo.toml b/third-party/blake3/rust/guts/Cargo.toml new file mode 100644 index 00000000..ebcf77fd --- /dev/null +++ b/third-party/blake3/rust/guts/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "blake3_guts" +version = "0.0.0" +authors = ["Jack O'Connor ", "Samuel Neves"] +description = "low-level building blocks for the BLAKE3 hash function" +repository = "https://github.com/BLAKE3-team/BLAKE3" +license = "CC0-1.0 OR Apache-2.0" +documentation = "https://docs.rs/blake3_guts" +readme = "readme.md" +edition = "2021" + +[dev-dependencies] +hex = "0.4.3" +reference_impl = { path = "../../reference_impl" } + +[features] +default = ["std"] +std = [] diff --git a/third-party/blake3/rust/guts/readme.md b/third-party/blake3/rust/guts/readme.md new file mode 100644 index 00000000..4957816d --- /dev/null +++ b/third-party/blake3/rust/guts/readme.md @@ -0,0 +1,80 @@ +# The BLAKE3 Guts API + +## Introduction + +This [`blake3_guts`](https://crates.io/crates/blake3_guts) sub-crate contains +low-level, high-performance, platform-specific implementations of the BLAKE3 +compression function. This API is complicated and unsafe, and this crate will +never have a stable release. Most callers should instead use the +[`blake3`](https://crates.io/crates/blake3) crate, which will eventually depend +on this one internally. + +The code you see here (as of January 2024) is an early stage of a large planned +refactor. The motivation for this refactor is a couple of missing features in +both the Rust and C implementations: + +- The output side + ([`OutputReader`](https://docs.rs/blake3/latest/blake3/struct.OutputReader.html) + in Rust) doesn't take advantage of the most important SIMD optimizations that + compute multiple blocks in parallel. 
This blocks any project that wants to + use the BLAKE3 XOF as a stream cipher + ([[1]](https://github.com/oconnor663/bessie), + [[2]](https://github.com/oconnor663/blake3_aead)). +- Low-level callers like [Bao](https://github.com/oconnor663/bao) that need + interior nodes of the tree also don't get those SIMD optimizations. They have + to use a slow, minimalistic, unstable, doc-hidden module [(also called + `guts`)](https://github.com/BLAKE3-team/BLAKE3/blob/master/src/guts.rs). + +The difficulty with adding those features is that they require changes to all +of our optimized assembly and C intrinsics code. That's a couple dozen +different files that are large, platform-specific, difficult to understand, and +full of duplicated code. The higher-level Rust and C implementations of BLAKE3 +both depend on these files and will need to coordinate changes. + +At the same time, it won't be long before we add support for more platforms: + +- RISCV vector extensions +- ARM SVE +- WebAssembly SIMD + +It's important to get this refactor done before new platforms make it even +harder to do. + +## The private guts API + +This is the API that each platform reimplements, so we want it to be as simple +as possible apart from the high-performance work it needs to do. It's +completely `unsafe`, and inputs and outputs are raw pointers that are allowed +to alias (this matters for `hash_parents`, see below). + +- `degree` +- `compress` + - The single compression function, for short inputs and odd-length tails. +- `hash_chunks` +- `hash_parents` +- `xof` +- `xof_xor` + - As `xof` but XOR'ing the result into the output buffer. +- `universal_hash` + - This is a new construction specifically to support + [BLAKE3-AEAD](https://github.com/oconnor663/blake3_aead). Some + implementations might just stub it out with portable code. + +## The public guts API + +This is the API that this crate exposes to callers, i.e. to the main `blake3` +crate. 
It's a thin, portable layer on top of the private API above. The Rust +version of this API is memory-safe. + +- `degree` +- `compress` +- `hash_chunks` +- `hash_parents` + - This handles most levels of the tree, where we keep hashing SIMD_DEGREE + parents at a time. +- `reduce_parents` + - This uses the same `hash_parents` private API, but it handles the top + levels of the tree where we reduce in-place to the root parent node. +- `xof` +- `xof_xor` +- `universal_hash` diff --git a/third-party/blake3/rust/guts/src/lib.rs b/third-party/blake3/rust/guts/src/lib.rs new file mode 100644 index 00000000..e9b4914b --- /dev/null +++ b/third-party/blake3/rust/guts/src/lib.rs @@ -0,0 +1,1000 @@ +//! # The BLAKE3 Guts API +//! +//! See `readme.md`. +//! +//! The main entrypoint into this crate is [`DETECTED_IMPL`], which is a global [`Implementation`] +//! that atomically initializes itself the first time you use it. +//! +//! # Example +//! +//! ```rust +//! use blake3_guts::{TransposedVectors, DETECTED_IMPL, IV_BYTES, PARENT, ROOT}; +//! +//! // Hash an input of exactly two chunks. +//! let input = [0u8; 2048]; +//! let mut outputs = TransposedVectors::new(); +//! let (left_outputs, _) = DETECTED_IMPL.split_transposed_vectors(&mut outputs); +//! DETECTED_IMPL.hash_chunks( +//! &input, +//! &IV_BYTES, +//! 0, // counter +//! 0, // flags +//! left_outputs, +//! ); +//! let root_node = outputs.extract_parent_node(0); +//! let hash = DETECTED_IMPL.compress( +//! &root_node, +//! 64, // block_len +//! &IV_BYTES, +//! 0, // counter +//! PARENT | ROOT, +//! ); +//! +//! // Compute the same hash using the reference implementation. +//! let mut reference_hasher = reference_impl::Hasher::new(); +//! reference_hasher.update(&input); +//! let mut expected_hash = [0u8; 32]; +//! reference_hasher.finalize(&mut expected_hash); +//! +//! assert_eq!(hash, expected_hash); +//! ``` + +// Tests always require libstd. 
+#![cfg_attr(all(not(feature = "std"), not(test)), no_std)] + +use core::cmp; +use core::marker::PhantomData; +use core::mem; +use core::ptr; +use core::sync::atomic::{AtomicPtr, Ordering::Relaxed}; + +pub mod portable; + +#[cfg(test)] +mod test; + +pub const OUT_LEN: usize = 32; +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; +pub const WORD_LEN: usize = 4; +pub const UNIVERSAL_HASH_LEN: usize = 16; + +pub const CHUNK_START: u32 = 1 << 0; +pub const CHUNK_END: u32 = 1 << 1; +pub const PARENT: u32 = 1 << 2; +pub const ROOT: u32 = 1 << 3; +pub const KEYED_HASH: u32 = 1 << 4; +pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +pub const IV: CVWords = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; +pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV); + +pub const MSG_SCHEDULE: [[usize; 16]; 7] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], + [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], + [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], + [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], + [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], + [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], +]; + +// never less than 2 +pub const MAX_SIMD_DEGREE: usize = 2; + +pub type CVBytes = [u8; 32]; +pub type CVWords = [u32; 8]; +pub type BlockBytes = [u8; 64]; +pub type BlockWords = [u32; 16]; + +pub static DETECTED_IMPL: Implementation = Implementation::new( + degree_init, + compress_init, + hash_chunks_init, + hash_parents_init, + xof_init, + xof_xor_init, + universal_hash_init, +); + +fn detect() -> Implementation { + portable::implementation() +} + +fn init_detected_impl() { + let detected = detect(); + + DETECTED_IMPL + .degree_ptr + .store(detected.degree_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .compress_ptr + 
.store(detected.compress_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .hash_chunks_ptr + .store(detected.hash_chunks_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .hash_parents_ptr + .store(detected.hash_parents_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .xof_ptr + .store(detected.xof_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .xof_xor_ptr + .store(detected.xof_xor_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .universal_hash_ptr + .store(detected.universal_hash_ptr.load(Relaxed), Relaxed); +} + +pub struct Implementation { + degree_ptr: AtomicPtr<()>, + compress_ptr: AtomicPtr<()>, + hash_chunks_ptr: AtomicPtr<()>, + hash_parents_ptr: AtomicPtr<()>, + xof_ptr: AtomicPtr<()>, + xof_xor_ptr: AtomicPtr<()>, + universal_hash_ptr: AtomicPtr<()>, +} + +impl Implementation { + const fn new( + degree_fn: DegreeFn, + compress_fn: CompressFn, + hash_chunks_fn: HashChunksFn, + hash_parents_fn: HashParentsFn, + xof_fn: XofFn, + xof_xor_fn: XofFn, + universal_hash_fn: UniversalHashFn, + ) -> Self { + Self { + degree_ptr: AtomicPtr::new(degree_fn as *mut ()), + compress_ptr: AtomicPtr::new(compress_fn as *mut ()), + hash_chunks_ptr: AtomicPtr::new(hash_chunks_fn as *mut ()), + hash_parents_ptr: AtomicPtr::new(hash_parents_fn as *mut ()), + xof_ptr: AtomicPtr::new(xof_fn as *mut ()), + xof_xor_ptr: AtomicPtr::new(xof_xor_fn as *mut ()), + universal_hash_ptr: AtomicPtr::new(universal_hash_fn as *mut ()), + } + } + + #[inline] + fn degree_fn(&self) -> DegreeFn { + unsafe { mem::transmute(self.degree_ptr.load(Relaxed)) } + } + + #[inline] + pub fn degree(&self) -> usize { + let degree = unsafe { self.degree_fn()() }; + debug_assert!(degree >= 2); + debug_assert!(degree <= MAX_SIMD_DEGREE); + debug_assert_eq!(1, degree.count_ones(), "power of 2"); + degree + } + + #[inline] + pub fn split_transposed_vectors<'v>( + &self, + vectors: &'v mut TransposedVectors, + ) -> (TransposedSplit<'v>, TransposedSplit<'v>) { + unsafe { vectors.split(self.degree()) } + } + + #[inline] + 
fn compress_fn(&self) -> CompressFn { + unsafe { mem::transmute(self.compress_ptr.load(Relaxed)) } + } + + #[inline] + pub fn compress( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + counter: u64, + flags: u32, + ) -> CVBytes { + let mut out = [0u8; 32]; + unsafe { + self.compress_fn()(block, block_len, cv, counter, flags, &mut out); + } + out + } + + // The contract for HashChunksFn doesn't require the implementation to support single-chunk + // inputs. Instead we handle that case here by calling compress in a loop. + #[inline] + fn hash_one_chunk( + &self, + mut input: &[u8], + key: &CVBytes, + counter: u64, + mut flags: u32, + output: TransposedSplit, + ) { + debug_assert!(input.len() <= CHUNK_LEN); + let mut cv = *key; + flags |= CHUNK_START; + while input.len() > BLOCK_LEN { + cv = self.compress( + input[..BLOCK_LEN].try_into().unwrap(), + BLOCK_LEN as u32, + &cv, + counter, + flags, + ); + input = &input[BLOCK_LEN..]; + flags &= !CHUNK_START; + } + let mut final_block = [0u8; BLOCK_LEN]; + final_block[..input.len()].copy_from_slice(input); + cv = self.compress( + &final_block, + input.len() as u32, + &cv, + counter, + flags | CHUNK_END, + ); + unsafe { + write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr); + } + } + + #[inline] + fn hash_chunks_fn(&self) -> HashChunksFn { + unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) } + } + + #[inline] + pub fn hash_chunks( + &self, + input: &[u8], + key: &CVBytes, + counter: u64, + flags: u32, + transposed_output: TransposedSplit, + ) -> usize { + debug_assert!(input.len() <= self.degree() * CHUNK_LEN); + if input.len() <= CHUNK_LEN { + // The underlying hash_chunks_fn isn't required to support this case. Instead we handle + // it by calling compress_fn in a loop. But note that we still don't support root + // finalization or the empty input here. 
+ self.hash_one_chunk(input, key, counter, flags, transposed_output); + return 1; + } + // SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently + // ignore the remainder. This makes it impossible to write out of bounds in a properly + // constructed TransposedSplit. + let len = cmp::min(input.len(), MAX_SIMD_DEGREE * CHUNK_LEN); + unsafe { + self.hash_chunks_fn()( + input.as_ptr(), + len, + key, + counter, + flags, + transposed_output.ptr, + ); + } + if input.len() % CHUNK_LEN == 0 { + input.len() / CHUNK_LEN + } else { + (input.len() / CHUNK_LEN) + 1 + } + } + + #[inline] + fn hash_parents_fn(&self) -> HashParentsFn { + unsafe { mem::transmute(self.hash_parents_ptr.load(Relaxed)) } + } + + #[inline] + pub fn hash_parents( + &self, + transposed_input: &TransposedVectors, + mut num_cvs: usize, + key: &CVBytes, + flags: u32, + transposed_output: TransposedSplit, + ) -> usize { + debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); + // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. + num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); + let mut odd_cv = [0u32; 8]; + if num_cvs % 2 == 1 { + unsafe { + odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1)); + } + } + let num_parents = num_cvs / 2; + unsafe { + self.hash_parents_fn()( + transposed_input.as_ptr(), + num_parents, + key, + flags | PARENT, + transposed_output.ptr, + ); + } + if num_cvs % 2 == 1 { + unsafe { + write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents)); + } + num_parents + 1 + } else { + num_parents + } + } + + #[inline] + pub fn reduce_parents( + &self, + transposed_in_out: &mut TransposedVectors, + mut num_cvs: usize, + key: &CVBytes, + flags: u32, + ) -> usize { + debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); + // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. 
+ num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); + let in_out_ptr = transposed_in_out.as_mut_ptr(); + let mut odd_cv = [0u32; 8]; + if num_cvs % 2 == 1 { + unsafe { + odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1)); + } + } + let num_parents = num_cvs / 2; + unsafe { + self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr); + } + if num_cvs % 2 == 1 { + unsafe { + write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents)); + } + num_parents + 1 + } else { + num_parents + } + } + + #[inline] + fn xof_fn(&self) -> XofFn { + unsafe { mem::transmute(self.xof_ptr.load(Relaxed)) } + } + + #[inline] + pub fn xof( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + mut counter: u64, + flags: u32, + mut out: &mut [u8], + ) { + let degree = self.degree(); + let simd_len = degree * BLOCK_LEN; + while !out.is_empty() { + let take = cmp::min(simd_len, out.len()); + unsafe { + self.xof_fn()( + block, + block_len, + cv, + counter, + flags | ROOT, + out.as_mut_ptr(), + take, + ); + } + out = &mut out[take..]; + counter += degree as u64; + } + } + + #[inline] + fn xof_xor_fn(&self) -> XofFn { + unsafe { mem::transmute(self.xof_xor_ptr.load(Relaxed)) } + } + + #[inline] + pub fn xof_xor( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + mut counter: u64, + flags: u32, + mut out: &mut [u8], + ) { + let degree = self.degree(); + let simd_len = degree * BLOCK_LEN; + while !out.is_empty() { + let take = cmp::min(simd_len, out.len()); + unsafe { + self.xof_xor_fn()( + block, + block_len, + cv, + counter, + flags | ROOT, + out.as_mut_ptr(), + take, + ); + } + out = &mut out[take..]; + counter += degree as u64; + } + } + + #[inline] + fn universal_hash_fn(&self) -> UniversalHashFn { + unsafe { mem::transmute(self.universal_hash_ptr.load(Relaxed)) } + } + + #[inline] + pub fn universal_hash(&self, mut input: &[u8], key: &CVBytes, mut counter: u64) -> [u8; 16] { + let degree = self.degree(); + let simd_len = degree * 
BLOCK_LEN; + let mut ret = [0u8; 16]; + while !input.is_empty() { + let take = cmp::min(simd_len, input.len()); + let mut output = [0u8; 16]; + unsafe { + self.universal_hash_fn()(input.as_ptr(), take, key, counter, &mut output); + } + input = &input[take..]; + counter += degree as u64; + for byte_index in 0..16 { + ret[byte_index] ^= output[byte_index]; + } + } + ret + } +} + +impl Clone for Implementation { + fn clone(&self) -> Self { + Self { + degree_ptr: AtomicPtr::new(self.degree_ptr.load(Relaxed)), + compress_ptr: AtomicPtr::new(self.compress_ptr.load(Relaxed)), + hash_chunks_ptr: AtomicPtr::new(self.hash_chunks_ptr.load(Relaxed)), + hash_parents_ptr: AtomicPtr::new(self.hash_parents_ptr.load(Relaxed)), + xof_ptr: AtomicPtr::new(self.xof_ptr.load(Relaxed)), + xof_xor_ptr: AtomicPtr::new(self.xof_xor_ptr.load(Relaxed)), + universal_hash_ptr: AtomicPtr::new(self.universal_hash_ptr.load(Relaxed)), + } + } +} + +// never less than 2 +type DegreeFn = unsafe extern "C" fn() -> usize; + +unsafe extern "C" fn degree_init() -> usize { + init_detected_impl(); + DETECTED_IMPL.degree_fn()() +} + +type CompressFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, // may overlap the input +); + +unsafe extern "C" fn compress_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, +) { + init_detected_impl(); + DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out); +} + +type CompressXofFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut BlockBytes, // may overlap the input +); + +type HashChunksFn = unsafe extern "C" fn( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +); + +unsafe 
extern "C" fn hash_chunks_init( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +) { + init_detected_impl(); + DETECTED_IMPL.hash_chunks_fn()(input, input_len, key, counter, flags, transposed_output); +} + +type HashParentsFn = unsafe extern "C" fn( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, // may overlap the input +); + +unsafe extern "C" fn hash_parents_init( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, +) { + init_detected_impl(); + DETECTED_IMPL.hash_parents_fn()(transposed_input, num_parents, key, flags, transposed_output); +} + +// This signature covers both xof() and xof_xor(). +type XofFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +); + +unsafe extern "C" fn xof_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + init_detected_impl(); + DETECTED_IMPL.xof_fn()(block, block_len, cv, counter, flags, out, out_len); +} + +unsafe extern "C" fn xof_xor_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + init_detected_impl(); + DETECTED_IMPL.xof_xor_fn()(block, block_len, cv, counter, flags, out, out_len); +} + +type UniversalHashFn = unsafe extern "C" fn( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +); + +unsafe extern "C" fn universal_hash_init( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +) { + init_detected_impl(); + DETECTED_IMPL.universal_hash_fn()(input, input_len, key, counter, out); +} + +// The 
implicit degree of this implementation is MAX_SIMD_DEGREE. +#[inline(always)] +unsafe fn hash_chunks_using_compress( + compress: CompressFn, + mut input: *const u8, + mut input_len: usize, + key: *const CVBytes, + mut counter: u64, + flags: u32, + mut transposed_output: *mut u32, +) { + debug_assert!(input_len > 0); + debug_assert!(input_len <= MAX_SIMD_DEGREE * CHUNK_LEN); + input_len = cmp::min(input_len, MAX_SIMD_DEGREE * CHUNK_LEN); + while input_len > 0 { + let mut chunk_len = cmp::min(input_len, CHUNK_LEN); + input_len -= chunk_len; + // We only use 8 words of the CV, but compress returns 16. + let mut cv = *key; + let cv_ptr: *mut CVBytes = &mut cv; + let mut chunk_flags = flags | CHUNK_START; + while chunk_len > BLOCK_LEN { + compress( + input as *const BlockBytes, + BLOCK_LEN as u32, + cv_ptr, + counter, + chunk_flags, + cv_ptr, + ); + input = input.add(BLOCK_LEN); + chunk_len -= BLOCK_LEN; + chunk_flags &= !CHUNK_START; + } + let mut last_block = [0u8; BLOCK_LEN]; + ptr::copy_nonoverlapping(input, last_block.as_mut_ptr(), chunk_len); + input = input.add(chunk_len); + compress( + &last_block, + chunk_len as u32, + cv_ptr, + counter, + chunk_flags | CHUNK_END, + cv_ptr, + ); + let cv_words = words_from_le_bytes_32(&cv); + for word_index in 0..8 { + transposed_output + .add(word_index * TRANSPOSED_STRIDE) + .write(cv_words[word_index]); + } + transposed_output = transposed_output.add(1); + counter += 1; + } +} + +// The implicit degree of this implementation is MAX_SIMD_DEGREE. 
+#[inline(always)] +unsafe fn hash_parents_using_compress( + compress: CompressFn, + mut transposed_input: *const u32, + mut num_parents: usize, + key: *const CVBytes, + flags: u32, + mut transposed_output: *mut u32, // may overlap the input +) { + debug_assert!(num_parents > 0); + debug_assert!(num_parents <= MAX_SIMD_DEGREE); + while num_parents > 0 { + let mut block_bytes = [0u8; 64]; + for word_index in 0..8 { + let left_child_word = transposed_input.add(word_index * TRANSPOSED_STRIDE).read(); + block_bytes[WORD_LEN * word_index..][..WORD_LEN] + .copy_from_slice(&left_child_word.to_le_bytes()); + let right_child_word = transposed_input + .add(word_index * TRANSPOSED_STRIDE + 1) + .read(); + block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN] + .copy_from_slice(&right_child_word.to_le_bytes()); + } + let mut cv = [0u8; 32]; + compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv); + let cv_words = words_from_le_bytes_32(&cv); + for word_index in 0..8 { + transposed_output + .add(word_index * TRANSPOSED_STRIDE) + .write(cv_words[word_index]); + } + transposed_input = transposed_input.add(2); + transposed_output = transposed_output.add(1); + num_parents -= 1; + } +} + +#[inline(always)] +unsafe fn xof_using_compress_xof( + compress_xof: CompressXofFn, + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + mut counter: u64, + flags: u32, + mut out: *mut u8, + mut out_len: usize, +) { + debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); + while out_len > 0 { + let mut block_output = [0u8; 64]; + compress_xof(block, block_len, cv, counter, flags, &mut block_output); + let take = cmp::min(out_len, BLOCK_LEN); + ptr::copy_nonoverlapping(block_output.as_ptr(), out, take); + out = out.add(take); + out_len -= take; + counter += 1; + } +} + +#[inline(always)] +unsafe fn xof_xor_using_compress_xof( + compress_xof: CompressXofFn, + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + mut counter: u64, + flags: u32, + mut out: 
*mut u8, + mut out_len: usize, +) { + debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); + while out_len > 0 { + let mut block_output = [0u8; 64]; + compress_xof(block, block_len, cv, counter, flags, &mut block_output); + let take = cmp::min(out_len, BLOCK_LEN); + for i in 0..take { + *out.add(i) ^= block_output[i]; + } + out = out.add(take); + out_len -= take; + counter += 1; + } +} + +#[inline(always)] +unsafe fn universal_hash_using_compress( + compress: CompressFn, + mut input: *const u8, + mut input_len: usize, + key: *const CVBytes, + mut counter: u64, + out: *mut [u8; 16], +) { + let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT; + let mut result = [0u8; 16]; + while input_len > 0 { + let block_len = cmp::min(input_len, BLOCK_LEN); + let mut block = [0u8; BLOCK_LEN]; + ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len); + let mut block_output = [0u8; 32]; + compress( + &block, + block_len as u32, + key, + counter, + flags, + &mut block_output, + ); + for i in 0..16 { + result[i] ^= block_output[i]; + } + input = input.add(block_len); + input_len -= block_len; + counter += 1; + } + *out = result; +} + +// this is in units of *words*, for pointer operations on *const/*mut u32 +const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE; + +#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), repr(C, align(64)))] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]); + +impl TransposedVectors { + pub fn new() -> Self { + Self([[0; 2 * MAX_SIMD_DEGREE]; 8]) + } + + pub fn extract_cv(&self, cv_index: usize) -> CVBytes { + let mut words = [0u32; 8]; + for word_index in 0..8 { + words[word_index] = self.0[word_index][cv_index]; + } + le_bytes_from_words_32(&words) + } + + pub fn extract_parent_node(&self, parent_index: usize) -> BlockBytes { + let mut bytes = [0u8; 64]; + bytes[..32].copy_from_slice(&self.extract_cv(parent_index / 2)); + bytes[32..].copy_from_slice(&self.extract_cv(parent_index 
/ 2 + 1)); + bytes + } + + fn as_ptr(&self) -> *const u32 { + self.0[0].as_ptr() + } + + fn as_mut_ptr(&mut self) -> *mut u32 { + self.0[0].as_mut_ptr() + } + + // SAFETY: This function is just pointer arithmetic, but callers assume that it's safe (not + // necessarily correct) to write up to `degree` words to either side of the split, possibly + // from different threads. + unsafe fn split(&mut self, degree: usize) -> (TransposedSplit, TransposedSplit) { + debug_assert!(degree > 0); + debug_assert!(degree <= MAX_SIMD_DEGREE); + debug_assert_eq!(degree.count_ones(), 1, "power of 2"); + let ptr = self.as_mut_ptr(); + let left = TransposedSplit { + ptr, + phantom_data: PhantomData, + }; + let right = TransposedSplit { + ptr: ptr.wrapping_add(degree), + phantom_data: PhantomData, + }; + (left, right) + } +} + +pub struct TransposedSplit<'vectors> { + ptr: *mut u32, + phantom_data: PhantomData<&'vectors mut u32>, +} + +unsafe impl<'vectors> Send for TransposedSplit<'vectors> {} +unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {} + +unsafe fn read_transposed_cv(src: *const u32) -> CVWords { + let mut cv = [0u32; 8]; + for word_index in 0..8 { + let offset_words = word_index * TRANSPOSED_STRIDE; + cv[word_index] = src.add(offset_words).read(); + } + cv +} + +unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) { + for word_index in 0..8 { + let offset_words = word_index * TRANSPOSED_STRIDE; + dest.add(offset_words).write(cv[word_index]); + } +} + +#[inline(always)] +pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes { + let mut bytes = [0u8; 32]; + // This loop is super verbose because currently that's what it takes to be const. 
+ let mut word_index = 0; + while word_index < bytes.len() / WORD_LEN { + let word_bytes = words[word_index].to_le_bytes(); + let mut byte_index = 0; + while byte_index < WORD_LEN { + bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; + byte_index += 1; + } + word_index += 1; + } + bytes +} + +#[inline(always)] +pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes { + let mut bytes = [0u8; 64]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < bytes.len() / WORD_LEN { + let word_bytes = words[word_index].to_le_bytes(); + let mut byte_index = 0; + while byte_index < WORD_LEN { + bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; + byte_index += 1; + } + word_index += 1; + } + bytes +} + +#[inline(always)] +pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords { + let mut words = [0u32; 8]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < words.len() { + let mut word_bytes = [0u8; WORD_LEN]; + let mut byte_index = 0; + while byte_index < WORD_LEN { + word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; + byte_index += 1; + } + words[word_index] = u32::from_le_bytes(word_bytes); + word_index += 1; + } + words +} + +#[inline(always)] +pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords { + let mut words = [0u32; 16]; + // This loop is super verbose because currently that's what it takes to be const. 
+ let mut word_index = 0; + while word_index < words.len() { + let mut word_bytes = [0u8; WORD_LEN]; + let mut byte_index = 0; + while byte_index < WORD_LEN { + word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; + byte_index += 1; + } + words[word_index] = u32::from_le_bytes(word_bytes); + word_index += 1; + } + words +} + +#[test] +fn test_byte_word_round_trips() { + let cv = *b"This is 32 LE bytes/eight words."; + assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv))); + let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words."; + assert_eq!( + block, + le_bytes_from_words_64(&words_from_le_bytes_64(&block)), + ); +} + +// The largest power of two less than or equal to `n`, used for left_len() +// immediately below, and also directly in Hasher::update(). +pub fn largest_power_of_two_leq(n: usize) -> usize { + ((n / 2) + 1).next_power_of_two() +} + +#[test] +fn test_largest_power_of_two_leq() { + let input_output = &[ + // The zero case is nonsensical, but it does work. + (0, 1), + (1, 1), + (2, 2), + (3, 2), + (4, 4), + (5, 4), + (6, 4), + (7, 4), + (8, 8), + // the largest possible usize + (usize::MAX, (usize::MAX >> 1) + 1), + ]; + for &(input, output) in input_output { + assert_eq!( + output, + crate::largest_power_of_two_leq(input), + "wrong output for n={}", + input + ); + } +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +pub fn left_len(content_len: usize) -> usize { + debug_assert!(content_len > CHUNK_LEN); + // Subtract 1 to reserve at least one byte for the right side. 
+ let full_chunks = (content_len - 1) / CHUNK_LEN; + largest_power_of_two_leq(full_chunks) * CHUNK_LEN +} + +#[test] +fn test_left_len() { + let input_output = &[ + (CHUNK_LEN + 1, CHUNK_LEN), + (2 * CHUNK_LEN - 1, CHUNK_LEN), + (2 * CHUNK_LEN, CHUNK_LEN), + (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN, 2 * CHUNK_LEN), + (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), + ]; + for &(input, output) in input_output { + assert_eq!(left_len(input), output); + } +} diff --git a/third-party/blake3/rust/guts/src/portable.rs b/third-party/blake3/rust/guts/src/portable.rs new file mode 100644 index 00000000..d5976440 --- /dev/null +++ b/third-party/blake3/rust/guts/src/portable.rs @@ -0,0 +1,262 @@ +use crate::{ + le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64, + BlockBytes, BlockWords, CVBytes, CVWords, Implementation, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE, +}; + +const DEGREE: usize = MAX_SIMD_DEGREE; + +unsafe extern "C" fn degree() -> usize { + DEGREE +} + +#[inline(always)] +fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +#[inline(always)] +fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) { + // Select the message schedule based on the round. + let schedule = MSG_SCHEDULE[round]; + + // Mix the columns. 
+ g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the diagonals. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +#[inline(always)] +fn compress_inner( + block_words: &BlockWords, + block_len: u32, + cv_words: &CVWords, + counter: u64, + flags: u32, +) -> [u32; 16] { + let mut state = [ + cv_words[0], + cv_words[1], + cv_words[2], + cv_words[3], + cv_words[4], + cv_words[5], + cv_words[6], + cv_words[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter as u32, + (counter >> 32) as u32, + block_len as u32, + flags as u32, + ]; + for round_number in 0..7 { + round(&mut state, &block_words, round_number); + } + state +} + +pub(crate) unsafe extern "C" fn compress( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, +) { + let block_words = words_from_le_bytes_64(&*block); + let cv_words = words_from_le_bytes_32(&*cv); + let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); + for word_index in 0..8 { + state[word_index] ^= state[word_index + 8]; + } + *out = le_bytes_from_words_32(state[..8].try_into().unwrap()); +} + +pub(crate) unsafe extern "C" fn compress_xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut BlockBytes, +) { + let block_words = words_from_le_bytes_64(&*block); + let cv_words = words_from_le_bytes_32(&*cv); + let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); + for word_index in 0..8 { + state[word_index] ^= state[word_index + 8]; + state[word_index + 8] ^= 
cv_words[word_index]; + } + *out = le_bytes_from_words_64(&state); +} + +pub(crate) unsafe extern "C" fn hash_chunks( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +) { + crate::hash_chunks_using_compress( + compress, + input, + input_len, + key, + counter, + flags, + transposed_output, + ) +} + +pub(crate) unsafe extern "C" fn hash_parents( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, // may overlap the input +) { + crate::hash_parents_using_compress( + compress, + transposed_input, + num_parents, + key, + flags, + transposed_output, + ) +} + +pub(crate) unsafe extern "C" fn xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + crate::xof_using_compress_xof( + compress_xof, + block, + block_len, + cv, + counter, + flags, + out, + out_len, + ) +} + +pub(crate) unsafe extern "C" fn xof_xor( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + crate::xof_xor_using_compress_xof( + compress_xof, + block, + block_len, + cv, + counter, + flags, + out, + out_len, + ) +} + +pub(crate) unsafe extern "C" fn universal_hash( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +) { + crate::universal_hash_using_compress(compress, input, input_len, key, counter, out) +} + +pub fn implementation() -> Implementation { + Implementation::new( + degree, + compress, + hash_chunks, + hash_parents, + xof, + xof_xor, + universal_hash, + ) +} + +#[cfg(test)] +mod test { + use super::*; + + // This is circular but do it anyway. 
+ #[test] + fn test_compress_vs_portable() { + crate::test::test_compress_vs_portable(&implementation()); + } + + #[test] + fn test_compress_vs_reference() { + crate::test::test_compress_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_hash_chunks_vs_portable() { + crate::test::test_hash_chunks_vs_portable(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_hash_parents_vs_portable() { + crate::test::test_hash_parents_vs_portable(&implementation()); + } + + #[test] + fn test_chunks_and_parents_vs_reference() { + crate::test::test_chunks_and_parents_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_xof_vs_portable() { + crate::test::test_xof_vs_portable(&implementation()); + } + + #[test] + fn test_xof_vs_reference() { + crate::test::test_xof_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_universal_hash_vs_portable() { + crate::test::test_universal_hash_vs_portable(&implementation()); + } + + #[test] + fn test_universal_hash_vs_reference() { + crate::test::test_universal_hash_vs_reference(&implementation()); + } +} diff --git a/third-party/blake3/rust/guts/src/test.rs b/third-party/blake3/rust/guts/src/test.rs new file mode 100644 index 00000000..83bd790c --- /dev/null +++ b/third-party/blake3/rust/guts/src/test.rs @@ -0,0 +1,523 @@ +use crate::*; + +pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; + +// Test a few different initial counter values. +// - 0: The base case. +// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR when +// you're supposed to ANDNOT. +// - u32::MAX: The low word of the counter overflows for all inputs except the first. +// - (42 << 32) + u32::MAX: Same but with a non-zero value in the high word. 
+const INITIAL_COUNTERS: [u64; 4] = [ + 0, + i32::MAX as u64, + u32::MAX as u64, + (42u64 << 32) + u32::MAX as u64, +]; + +const BLOCK_LENGTHS: [usize; 4] = [0, 1, 63, 64]; + +pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +pub fn test_compress_vs_portable(test_impl: &Implementation) { + for block_len in BLOCK_LENGTHS { + dbg!(block_len); + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len]); + for counter in INITIAL_COUNTERS { + dbg!(counter); + let portable_cv = portable::implementation().compress( + &block, + block_len as u32, + &TEST_KEY, + counter, + KEYED_HASH, + ); + + let test_cv = + test_impl.compress(&block, block_len as u32, &TEST_KEY, counter, KEYED_HASH); + + assert_eq!(portable_cv, test_cv); + } + } +} + +pub fn test_compress_vs_reference(test_impl: &Implementation) { + for block_len in BLOCK_LENGTHS { + dbg!(block_len); + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len]); + + let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + ref_hasher.update(&block[..block_len]); + let mut ref_hash = [0u8; 32]; + ref_hasher.finalize(&mut ref_hash); + + let test_cv = test_impl.compress( + &block, + block_len as u32, + &TEST_KEY, + 0, + CHUNK_START | CHUNK_END | ROOT | KEYED_HASH, + ); + + assert_eq!(ref_hash, test_cv); + } +} + +fn check_transposed_eq(output_a: &TransposedVectors, output_b: &TransposedVectors) { + if output_a == output_b { + return; + } + for cv_index in 0..2 * MAX_SIMD_DEGREE { + let cv_a = output_a.extract_cv(cv_index); + let cv_b = output_b.extract_cv(cv_index); + if cv_a == [0; 32] && cv_b == [0; 32] { + println!("CV {cv_index:2} empty"); + } else if cv_a == cv_b { + println!("CV {cv_index:2} matches"); + } else { + println!("CV {cv_index:2} mismatch:"); + println!(" {}", hex::encode(cv_a)); + println!(" {}", hex::encode(cv_b)); + } + } + panic!("transposed outputs are not equal"); +} + +pub fn 
test_hash_chunks_vs_portable(test_impl: &Implementation) { + assert!(test_impl.degree() <= MAX_SIMD_DEGREE); + dbg!(test_impl.degree() * CHUNK_LEN); + // Allocate 4 extra bytes of padding so we can make aligned slices. + let mut input_buf = [0u8; 2 * 2 * MAX_SIMD_DEGREE * CHUNK_LEN + 4]; + let mut input_slice = &mut input_buf[..]; + // Make sure the start of the input is word-aligned. + while input_slice.as_ptr() as usize % 4 != 0 { + input_slice = &mut input_slice[1..]; + } + let (aligned_input, mut unaligned_input) = + input_slice.split_at_mut(2 * MAX_SIMD_DEGREE * CHUNK_LEN); + unaligned_input = &mut unaligned_input[1..][..2 * MAX_SIMD_DEGREE * CHUNK_LEN]; + assert_eq!(aligned_input.as_ptr() as usize % 4, 0); + assert_eq!(unaligned_input.as_ptr() as usize % 4, 1); + paint_test_input(aligned_input); + paint_test_input(unaligned_input); + // Try just below, equal to, and just above every whole number of chunks. + let mut input_2_lengths = Vec::new(); + let mut next_len = 2 * CHUNK_LEN; + loop { + // 95 is one whole block plus one interesting part of another + input_2_lengths.push(next_len - 95); + input_2_lengths.push(next_len); + if next_len == test_impl.degree() * CHUNK_LEN { + break; + } + input_2_lengths.push(next_len + 95); + next_len += CHUNK_LEN; + } + for input_2_len in input_2_lengths { + dbg!(input_2_len); + let aligned_input1 = &aligned_input[..test_impl.degree() * CHUNK_LEN]; + let aligned_input2 = &aligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; + let unaligned_input1 = &unaligned_input[..test_impl.degree() * CHUNK_LEN]; + let unaligned_input2 = &unaligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; + for initial_counter in INITIAL_COUNTERS { + dbg!(initial_counter); + // Make two calls, to test the output_column parameter. 
+ let mut portable_output = TransposedVectors::new(); + let (portable_left, portable_right) = + test_impl.split_transposed_vectors(&mut portable_output); + portable::implementation().hash_chunks( + aligned_input1, + &IV_BYTES, + initial_counter, + 0, + portable_left, + ); + portable::implementation().hash_chunks( + aligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + portable_right, + ); + + let mut test_output = TransposedVectors::new(); + let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); + test_impl.hash_chunks(aligned_input1, &IV_BYTES, initial_counter, 0, test_left); + test_impl.hash_chunks( + aligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + test_right, + ); + check_transposed_eq(&portable_output, &test_output); + + // Do the same thing with unaligned input. + let mut unaligned_test_output = TransposedVectors::new(); + let (unaligned_left, unaligned_right) = + test_impl.split_transposed_vectors(&mut unaligned_test_output); + test_impl.hash_chunks( + unaligned_input1, + &IV_BYTES, + initial_counter, + 0, + unaligned_left, + ); + test_impl.hash_chunks( + unaligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + unaligned_right, + ); + check_transposed_eq(&portable_output, &unaligned_test_output); + } + } +} + +fn painted_transposed_input() -> TransposedVectors { + let mut vectors = TransposedVectors::new(); + let mut val = 0; + for col in 0..2 * MAX_SIMD_DEGREE { + for row in 0..8 { + vectors.0[row][col] = val; + val += 1; + } + } + vectors +} + +pub fn test_hash_parents_vs_portable(test_impl: &Implementation) { + assert!(test_impl.degree() <= MAX_SIMD_DEGREE); + let input = painted_transposed_input(); + for num_parents in 2..=(test_impl.degree() / 2) { + dbg!(num_parents); + let mut portable_output = TransposedVectors::new(); + let (portable_left, portable_right) = + test_impl.split_transposed_vectors(&mut 
portable_output); + portable::implementation().hash_parents( + &input, + 2 * num_parents, // num_cvs + &IV_BYTES, + 0, + portable_left, + ); + portable::implementation().hash_parents( + &input, + 2 * num_parents, // num_cvs + &TEST_KEY, + KEYED_HASH, + portable_right, + ); + + let mut test_output = TransposedVectors::new(); + let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); + test_impl.hash_parents( + &input, + 2 * num_parents, // num_cvs + &IV_BYTES, + 0, + test_left, + ); + test_impl.hash_parents( + &input, + 2 * num_parents, // num_cvs + &TEST_KEY, + KEYED_HASH, + test_right, + ); + + check_transposed_eq(&portable_output, &test_output); + } +} + +fn hash_with_chunks_and_parents_recurse( + test_impl: &Implementation, + input: &[u8], + counter: u64, + output: TransposedSplit, +) -> usize { + assert!(input.len() > 0); + if input.len() <= test_impl.degree() * CHUNK_LEN { + return test_impl.hash_chunks(input, &IV_BYTES, counter, 0, output); + } + let (left_input, right_input) = input.split_at(left_len(input.len())); + let mut child_output = TransposedVectors::new(); + let (left_output, right_output) = test_impl.split_transposed_vectors(&mut child_output); + let mut children = + hash_with_chunks_and_parents_recurse(test_impl, left_input, counter, left_output); + assert_eq!(children, test_impl.degree()); + children += hash_with_chunks_and_parents_recurse( + test_impl, + right_input, + counter + (left_input.len() / CHUNK_LEN) as u64, + right_output, + ); + test_impl.hash_parents(&child_output, children, &IV_BYTES, PARENT, output) +} + +// Note: This test implementation doesn't support the 1-chunk-or-less case. +fn root_hash_with_chunks_and_parents(test_impl: &Implementation, input: &[u8]) -> CVBytes { + // TODO: handle the 1-chunk case? + assert!(input.len() > CHUNK_LEN); + let mut cvs = TransposedVectors::new(); + // The right half of these vectors are never used. 
+ let (cvs_left, _) = test_impl.split_transposed_vectors(&mut cvs); + let mut num_cvs = hash_with_chunks_and_parents_recurse(test_impl, input, 0, cvs_left); + while num_cvs > 2 { + num_cvs = test_impl.reduce_parents(&mut cvs, num_cvs, &IV_BYTES, 0); + } + test_impl.compress( + &cvs.extract_parent_node(0), + BLOCK_LEN as u32, + &IV_BYTES, + 0, + PARENT | ROOT, + ) +} + +pub fn test_chunks_and_parents_vs_reference(test_impl: &Implementation) { + assert_eq!(test_impl.degree().count_ones(), 1, "power of 2"); + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * CHUNK_LEN; + let mut input_buf = [0u8; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try just below, equal to, and just above every whole number of chunks, except that + // root_hash_with_chunks_and_parents doesn't support the 1-chunk-or-less case. + let mut test_lengths = vec![CHUNK_LEN + 1]; + let mut next_len = 2 * CHUNK_LEN; + loop { + // 95 is one whole block plus one interesting part of another + test_lengths.push(next_len - 95); + test_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + test_lengths.push(next_len + 95); + next_len += CHUNK_LEN; + } + for test_len in test_lengths { + dbg!(test_len); + let input = &input_buf[..test_len]; + + let mut ref_hasher = reference_impl::Hasher::new(); + ref_hasher.update(&input); + let mut ref_hash = [0u8; 32]; + ref_hasher.finalize(&mut ref_hash); + + let test_hash = root_hash_with_chunks_and_parents(test_impl, input); + + assert_eq!(ref_hash, test_hash); + } +} + +pub fn test_xof_vs_portable(test_impl: &Implementation) { + let flags = CHUNK_START | CHUNK_END | KEYED_HASH; + for counter in INITIAL_COUNTERS { + dbg!(counter); + for input_len in [0, 1, BLOCK_LEN] { + dbg!(input_len); + let mut input_block = [0u8; BLOCK_LEN]; + for byte_index in 0..input_len { + input_block[byte_index] = byte_index as u8 + 42; + } + // Try equal to and partway through every whole number of output blocks. 
+ const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut output_lengths = Vec::new(); + let mut next_len = 0; + loop { + output_lengths.push(next_len); + if next_len == MAX_OUTPUT_LEN { + break; + } + output_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for output_len in output_lengths { + dbg!(output_len); + let mut portable_output = [0xff; MAX_OUTPUT_LEN]; + portable::implementation().xof( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut portable_output[..output_len], + ); + let mut test_output = [0xff; MAX_OUTPUT_LEN]; + test_impl.xof( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert_eq!(portable_output, test_output); + + // Double check that the implementation didn't overwrite. + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + + // The first XOR cancels out the output. + test_impl.xof_xor( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert!(test_output[..output_len].iter().all(|&b| b == 0)); + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + + // The second XOR restores out the output. + test_impl.xof_xor( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert_eq!(portable_output, test_output); + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + } + } + } +} + +pub fn test_xof_vs_reference(test_impl: &Implementation) { + let input = b"hello world"; + let mut input_block = [0; BLOCK_LEN]; + input_block[..input.len()].copy_from_slice(input); + + const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut ref_output = [0; MAX_OUTPUT_LEN]; + let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + ref_hasher.update(input); + ref_hasher.finalize(&mut ref_output); + + // Try equal to and partway through every whole number of output blocks. 
+ let mut output_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + output_lengths.push(next_len); + if next_len == MAX_OUTPUT_LEN { + break; + } + output_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + + for output_len in output_lengths { + dbg!(output_len); + let mut test_output = [0; MAX_OUTPUT_LEN]; + test_impl.xof( + &input_block, + input.len() as u32, + &TEST_KEY, + 0, + KEYED_HASH | CHUNK_START | CHUNK_END, + &mut test_output[..output_len], + ); + assert_eq!(ref_output[..output_len], test_output[..output_len]); + + // Double check that the implementation didn't overwrite. + assert!(test_output[output_len..].iter().all(|&b| b == 0)); + + // Do it again starting from block 1. + if output_len >= BLOCK_LEN { + test_impl.xof( + &input_block, + input.len() as u32, + &TEST_KEY, + 1, + KEYED_HASH | CHUNK_START | CHUNK_END, + &mut test_output[..output_len - BLOCK_LEN], + ); + assert_eq!( + ref_output[BLOCK_LEN..output_len], + test_output[..output_len - BLOCK_LEN], + ); + } + } +} + +pub fn test_universal_hash_vs_portable(test_impl: &Implementation) { + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut input_buf = [0; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try equal to and partway through every whole number of input blocks. 
+ let mut input_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + input_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + input_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for input_len in input_lengths { + dbg!(input_len); + for counter in INITIAL_COUNTERS { + dbg!(counter); + let portable_output = portable::implementation().universal_hash( + &input_buf[..input_len], + &TEST_KEY, + counter, + ); + let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, counter); + assert_eq!(portable_output, test_output); + } + } +} + +fn reference_impl_universal_hash(input: &[u8], key: &CVBytes) -> [u8; UNIVERSAL_HASH_LEN] { + // The reference_impl doesn't support XOF seeking, so we have to materialize an entire extended + // output to seek to a block. + const MAX_BLOCKS: usize = 2 * MAX_SIMD_DEGREE; + assert!(input.len() / BLOCK_LEN <= MAX_BLOCKS); + let mut output_buffer: [u8; BLOCK_LEN * MAX_BLOCKS] = [0u8; BLOCK_LEN * MAX_BLOCKS]; + let mut result = [0u8; UNIVERSAL_HASH_LEN]; + let mut block_start = 0; + while block_start < input.len() { + let block_len = cmp::min(input.len() - block_start, BLOCK_LEN); + let mut ref_hasher = reference_impl::Hasher::new_keyed(key); + ref_hasher.update(&input[block_start..block_start + block_len]); + ref_hasher.finalize(&mut output_buffer[..block_start + UNIVERSAL_HASH_LEN]); + for byte_index in 0..UNIVERSAL_HASH_LEN { + result[byte_index] ^= output_buffer[block_start + byte_index]; + } + block_start += BLOCK_LEN; + } + result +} + +pub fn test_universal_hash_vs_reference(test_impl: &Implementation) { + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut input_buf = [0; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try equal to and partway through every whole number of input blocks. 
+ let mut input_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + input_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + input_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for input_len in input_lengths { + dbg!(input_len); + let ref_output = reference_impl_universal_hash(&input_buf[..input_len], &TEST_KEY); + let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, 0); + assert_eq!(ref_output, test_output); + } +} diff --git a/third-party/blake3/src/io.rs b/third-party/blake3/src/io.rs new file mode 100644 index 00000000..1c19881e --- /dev/null +++ b/third-party/blake3/src/io.rs @@ -0,0 +1,79 @@ +//! Helper functions for efficient IO. + +#[cfg(feature = "std")] +pub(crate) fn copy_wide( + mut reader: impl std::io::Read, + hasher: &mut crate::Hasher, +) -> std::io::Result { + let mut buffer = [0; 65536]; + let mut total = 0; + loop { + match reader.read(&mut buffer) { + Ok(0) => return Ok(total), + Ok(n) => { + hasher.update(&buffer[..n]); + total += n as u64; + } + // see test_update_reader_interrupted + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + } + } +} + +// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or +// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it +// fails, return the error. +// +// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like +// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get +// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because +// this function is crate-private, we can guarantee that all can ever happen in the event of a race +// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of +// which should risk memory corruption in a safe caller. 
+// +// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no +// platform in the "real world" is ever going to do anything other than compute the "wrong answer" +// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this? +// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a +// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have +// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do +// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to +// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert +// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give +// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's +// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to +// construct a safe &i32 to the register if you're going to leak that reference to unknown callers. +// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally +// different here. Feedback needed. +#[cfg(feature = "mmap")] +pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result> { + let metadata = file.metadata()?; + let file_size = metadata.len(); + #[allow(clippy::if_same_then_else)] + if !metadata.is_file() { + // Not a real file. + Ok(None) + } else if file_size > isize::max_value() as u64 { + // Too long to safely map. + // https://github.com/danburkert/memmap-rs/issues/69 + Ok(None) + } else if file_size == 0 { + // Mapping an empty file currently fails. + // https://github.com/danburkert/memmap-rs/issues/72 + // See test_mmap_virtual_file. + Ok(None) + } else if file_size < 16 * 1024 { + // Mapping small files is not worth it. 
+ Ok(None) + } else { + // Explicitly set the length of the memory map, so that filesystem + // changes can't race to violate the invariants we just checked. + let map = unsafe { + memmap2::MmapOptions::new() + .len(file_size as usize) + .map(file)? + }; + Ok(Some(map)) + } +} diff --git a/third-party/blake3/src/lib.rs b/third-party/blake3/src/lib.rs index ac61fb27..d661cb2d 100644 --- a/third-party/blake3/src/lib.rs +++ b/third-party/blake3/src/lib.rs @@ -33,15 +33,33 @@ //! # Cargo Features //! //! The `std` feature (the only feature enabled by default) is required for -//! implementations of the [`Write`] and [`Seek`] traits, and also for runtime -//! CPU feature detection on x86. If this feature is disabled, the only way to -//! use the x86 SIMD implementations is to enable the corresponding instruction -//! sets globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting -//! binary will not be portable to other machines. +//! implementations of the [`Write`] and [`Seek`] traits, the +//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU +//! feature detection on x86. If this feature is disabled, the only way to use +//! the x86 SIMD implementations is to enable the corresponding instruction sets +//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary +//! will not be portable to other machines. //! //! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds -//! the [`Hasher::update_rayon`] method, for multithreaded hashing. However, -//! even if this feature is enabled, all other APIs remain single-threaded. +//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap` +//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for +//! multithreaded hashing. However, even if this feature is enabled, all other +//! APIs remain single-threaded. +//! +//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the +//! 
[`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above) +//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for +//! memory-mapped IO. +//! +//! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) +//! implements +//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for +//! this crate's types. +//! +//! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements +//! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and +//! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html) +//! for [`Hash`](struct@Hash). //! //! The NEON implementation is enabled by default for AArch64 but requires the //! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and @@ -49,12 +67,12 @@ //! without NEON support. //! //! The `traits-preview` feature enables implementations of traits from the -//! RustCrypto [`digest`] crate, and re-exports that crate as -//! `traits::digest`. However, the traits aren't stable, and they're expected to -//! change in incompatible ways before that crate reaches 1.0. For that reason, -//! this crate makes no SemVer guarantees for this feature, and callers who use -//! it should expect breaking changes between patch versions. (The "-preview" -//! feature name follows the conventions of the RustCrypto [`signature`] crate.) +//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`. +//! However, the traits aren't stable, and they're expected to change in +//! incompatible ways before that crate reaches 1.0. For that reason, this crate +//! makes no SemVer guarantees for this feature, and callers who use it should +//! expect breaking changes between patch versions. (The "-preview" feature name +//! follows the conventions of the RustCrypto [`signature`] crate.) //! //! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon //! 
[BLAKE3]: https://blake3.io @@ -112,6 +130,7 @@ mod sse41; #[cfg(feature = "traits-preview")] pub mod traits; +mod io; mod join; use arrayref::{array_mut_ref, array_ref}; @@ -197,6 +216,8 @@ fn counter_high(counter: u64) -> u32 { /// [`from_hex`]: #method.from_hex /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[derive(Clone, Copy, Hash)] pub struct Hash([u8; OUT_LEN]); @@ -284,10 +305,28 @@ impl core::str::FromStr for Hash { } } +// A proper implementation of constant time equality is tricky, and we get it from the +// constant_time_eq crate instead of rolling our own. However, that crate isn't compatible with +// Miri, so we roll our own just for that. +#[cfg(miri)] +fn constant_time_eq_miri(a: &[u8], b: &[u8]) -> bool { + if a.len() != b.len() { + return false; + } + let mut x = 0; + for i in 0..a.len() { + x |= a[i] ^ b[i]; + } + x == 0 +} + /// This implementation is constant-time. 
impl PartialEq for Hash { #[inline] fn eq(&self, other: &Hash) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, &other.0); + #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, &other.0) } } @@ -296,6 +335,9 @@ impl PartialEq for Hash { impl PartialEq<[u8; OUT_LEN]> for Hash { #[inline] fn eq(&self, other: &[u8; OUT_LEN]) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, other); + #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, other) } } @@ -304,6 +346,9 @@ impl PartialEq<[u8; OUT_LEN]> for Hash { impl PartialEq<[u8]> for Hash { #[inline] fn eq(&self, other: &[u8]) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, other); + #[cfg(not(miri))] constant_time_eq::constant_time_eq(&self.0, other) } } @@ -371,6 +416,7 @@ impl std::error::Error for HexError {} // Each chunk or parent node can produce either a 32-byte chaining value or, by // setting the ROOT flag, any number of final output bytes. The Output struct // captures the state just prior to choosing between those two possibilities. +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] struct Output { input_chaining_value: CVWords, @@ -378,6 +424,7 @@ struct Output { block_len: u8, counter: u64, flags: u8, + #[cfg_attr(feature = "zeroize", zeroize(skip))] platform: Platform, } @@ -414,6 +461,7 @@ impl Output { } #[derive(Clone)] +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] struct ChunkState { cv: CVWords, chunk_counter: u64, @@ -421,6 +469,7 @@ struct ChunkState { buf_len: u8, blocks_compressed: u8, flags: u8, + #[cfg_attr(feature = "zeroize", zeroize(skip))] platform: Platform, } @@ -903,6 +952,9 @@ fn parent_node_output( /// An incremental hash state that can accept any number of writes. /// +/// The `rayon` and `mmap` Cargo features enable additional methods on this +/// type related to multithreading and memory-mapped IO. 
+/// /// When the `traits-preview` Cargo feature is enabled, this type implements /// several commonly used traits from the /// [`digest`](https://crates.io/crates/digest) crate. However, those @@ -911,15 +963,6 @@ fn parent_node_output( /// guarantees for this feature, and callers who use it should expect breaking /// changes between patch versions. /// -/// When the `rayon` Cargo feature is enabled, the -/// [`update_rayon`](#method.update_rayon) method is available for multithreaded -/// hashing. -/// -/// **Performance note:** The [`update`](#method.update) method can't take full -/// advantage of SIMD optimizations if its input buffer is too small or oddly -/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently -/// supported SIMD instruction sets. -/// /// # Examples /// /// ``` @@ -942,6 +985,7 @@ fn parent_node_output( /// # } /// ``` #[derive(Clone)] +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] pub struct Hasher { key: CVWords, chunk_state: ChunkState, @@ -1069,48 +1113,17 @@ impl Hasher { self.cv_stack.push(*new_cv); } - /// Add input bytes to the hash state. You can call this any number of - /// times. + /// Add input bytes to the hash state. You can call this any number of times. /// /// This method is always single-threaded. For multithreading support, see - /// [`update_rayon`](#method.update_rayon) below (enabled with the `rayon` - /// Cargo feature). + /// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature). /// - /// Note that the degree of SIMD parallelism that `update` can use is - /// limited by the size of this input buffer. The 8 KiB buffer currently - /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but - /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to - /// leverage all currently supported SIMD instruction sets. 
- /// - /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html + /// Note that the degree of SIMD parallelism that `update` can use is limited by the size of + /// this input buffer. See [`update_reader`](#method.update_reader). pub fn update(&mut self, input: &[u8]) -> &mut Self { self.update_with_join::(input) } - /// Identical to [`update`](Hasher::update), but using Rayon-based - /// multithreading internally. - /// - /// This method is gated by the `rayon` Cargo feature, which is disabled by - /// default but enabled on [docs.rs](https://docs.rs). - /// - /// To get any performance benefit from multithreading, the input buffer - /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is - /// _slower_ than `update` for inputs under 128 KiB. That threshold varies - /// quite a lot across different processors, and it's important to benchmark - /// your specific use case. - /// - /// Memory mapping an entire input file is a simple way to take advantage of - /// multithreading without needing to carefully tune your buffer size or - /// offload IO. However, on spinning disks where random access is expensive, - /// that approach can lead to disk thrashing and terrible IO performance. - /// Note that OS page caching can mask this problem, in which case it might - /// only appear for files larger than available RAM. Again, benchmarking - /// your specific use case is important. - #[cfg(feature = "rayon")] - pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { - self.update_with_join::(input) - } - fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { // If we have some partial chunk bytes in the internal chunk_state, we // need to finish that chunk first. 
@@ -1309,6 +1322,182 @@ impl Hasher { pub fn count(&self) -> u64 { self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64 } + + /// As [`update`](Hasher::update), but reading from a + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation. + /// + /// [`Hasher`] implements + /// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to + /// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`] + /// from any reader. Unfortunately, this standard approach can limit performance, because + /// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of + /// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512) + /// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly + /// more convenient. + /// + /// The internal buffer size this method uses may change at any time, and it may be different + /// for different targets. The only guarantee is that it will be large enough for all of this + /// crate's SIMD implementations on the current platform. + /// + /// The most common implementer of + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be + /// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory + /// mapping can be faster than this method for hashing large files. See + /// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon), + /// which require the `mmap` and (for the latter) `rayon` Cargo features. + /// + /// This method requires the `std` Cargo feature, which is enabled by default. + /// + /// # Example + /// + /// ```no_run + /// # use std::fs::File; + /// # use std::io; + /// # fn main() -> io::Result<()> { + /// // Hash standard input. 
+ /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_reader(std::io::stdin().lock())?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "std")] + pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> { + io::copy_wide(reader, self)?; + Ok(self) + } + + /// As [`update`](Hasher::update), but using Rayon-based multithreading + /// internally. + /// + /// This method is gated by the `rayon` Cargo feature, which is disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// To get any performance benefit from multithreading, the input buffer + /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is + /// _slower_ than `update` for inputs under 128 KiB. That threshold varies + /// quite a lot across different processors, and it's important to benchmark + /// your specific use case. See also the performance warning associated with + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon). + /// + /// If you already have a large buffer in memory, and you want to hash it + /// with multiple threads, this method is a good option. However, reading a + /// file into memory just to call this method can be a performance mistake, + /// both because it requires lots of memory and because single-threaded + /// reads can be slow. For hashing whole files, see + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both + /// the `rayon` and `mmap` Cargo features. + #[cfg(feature = "rayon")] + pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { + self.update_with_join::(input) + } + + /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping. + /// + /// Not all files can be memory mapped, and memory mapping small files can be slower than + /// reading them the usual way. In those cases, this method will fall back to standard file IO. 
+ /// The heuristic for whether to use memory mapping is currently very simple (file size >= + /// 16 KiB), and it might change at any time. + /// + /// Like [`update`](Hasher::update), this method is single-threaded. In this author's + /// experience, memory mapping improves single-threaded performance by ~10% for large files + /// that are already in cache. This probably varies between platforms, and as always it's a + /// good idea to benchmark your own use case. In comparison, the multithreaded + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on + /// performance. + /// + /// There's a correctness reason that this method takes + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of + /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped + /// file ignores the seek position of the original file handle (it neither respects the current + /// position nor updates the position). This difference in behavior would've caused + /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and + /// have different side effects in some cases. Taking a + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by + /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is + /// opened internally. + /// + /// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on + /// [docs.rs](https://docs.rs). 
+ /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// let path = Path::new("file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap(path)?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + pub fn update_mmap(&mut self, path: impl AsRef) -> std::io::Result<&mut Self> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(self) + } + + /// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using + /// memory mapping. This is the default behavior of `b3sum`. + /// + /// For large files that are likely to be in cache, this can be much faster than + /// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other + /// cryptographic hashes, this is usually what they're measuring. However... + /// + /// **Performance Warning:** There are cases where multithreading hurts performance. The worst + /// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31), + /// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends + /// more time seeking around than reading data). Windows tends to be somewhat worse about this, + /// in part because it's less likely than Linux to keep very large files in cache. More + /// generally, if your CPU cores are already busy, then multithreading will add overhead + /// without improving performance. If your code runs in different environments that you don't + /// control and can't measure, then unfortunately there's no one-size-fits-all answer for + /// whether multithreading is a good idea. 
+ /// + /// The memory mapping behavior of this function is the same as + /// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard + /// file IO might change at any time. + /// + /// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// # #[cfg(feature = "rayon")] + /// # { + /// let path = Path::new("big_file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap_rayon(path)?; + /// println!("{}", hasher.finalize()); + /// # } + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + #[cfg(feature = "rayon")] + pub fn update_mmap_rayon( + &mut self, + path: impl AsRef, + ) -> std::io::Result<&mut Self> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update_rayon(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(self) + } } // Don't derive(Debug), because the state may be secret. @@ -1366,6 +1555,7 @@ impl std::io::Write for Hasher { /// from an unknown position in the output stream to recover its block index. Callers with strong /// secret keys aren't affected in practice, but secret offsets are a [design /// smell](https://en.wikipedia.org/wiki/Design_smell) in any case. 
+#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] pub struct OutputReader { inner: Output, diff --git a/third-party/blake3/src/platform.rs b/third-party/blake3/src/platform.rs index 00058b16..79bc9a3f 100644 --- a/third-party/blake3/src/platform.rs +++ b/third-party/blake3/src/platform.rs @@ -56,6 +56,11 @@ pub enum Platform { impl Platform { #[allow(unreachable_code)] pub fn detect() -> Self { + #[cfg(miri)] + { + return Platform::Portable; + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[cfg(blake3_avx512_ffi)] @@ -327,7 +332,12 @@ impl Platform { #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn avx512_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_avx512") { return false; @@ -349,7 +359,12 @@ pub fn avx512_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn avx2_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_avx2") { return false; @@ -371,7 +386,12 @@ pub fn avx2_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn sse41_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_sse41") { return false; @@ -395,6 +415,10 @@ pub fn sse41_detected() -> bool { #[inline(always)] #[allow(unreachable_code)] pub fn sse2_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. 
if cfg!(feature = "no_sse2") { return false; diff --git a/third-party/blake3/src/test.rs b/third-party/blake3/src/test.rs index 60bbe8cc..c76cbbc0 100644 --- a/third-party/blake3/src/test.rs +++ b/third-party/blake3/src/test.rs @@ -628,3 +628,211 @@ const fn test_hash_const_conversions() { let hash = crate::Hash::from_bytes(bytes); _ = hash.as_bytes(); } + +#[cfg(feature = "zeroize")] +#[test] +fn test_zeroize() { + use zeroize::Zeroize; + + let mut hash = crate::Hash([42; 32]); + hash.zeroize(); + assert_eq!(hash.0, [0u8; 32]); + + let mut hasher = crate::Hasher { + chunk_state: crate::ChunkState { + cv: [42; 8], + chunk_counter: 42, + buf: [42; 64], + buf_len: 42, + blocks_compressed: 42, + flags: 42, + platform: crate::Platform::Portable, + }, + key: [42; 8], + cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), + }; + hasher.zeroize(); + assert_eq!(hasher.chunk_state.cv, [0; 8]); + assert_eq!(hasher.chunk_state.chunk_counter, 0); + assert_eq!(hasher.chunk_state.buf, [0; 64]); + assert_eq!(hasher.chunk_state.buf_len, 0); + assert_eq!(hasher.chunk_state.blocks_compressed, 0); + assert_eq!(hasher.chunk_state.flags, 0); + assert!(matches!( + hasher.chunk_state.platform, + crate::Platform::Portable + )); + assert_eq!(hasher.key, [0; 8]); + assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]); + + let mut output_reader = crate::OutputReader { + inner: crate::Output { + input_chaining_value: [42; 8], + block: [42; 64], + counter: 42, + block_len: 42, + flags: 42, + platform: crate::Platform::Portable, + }, + position_within_block: 42, + }; + + output_reader.zeroize(); + assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); + assert_eq!(output_reader.inner.block, [0; 64]); + assert_eq!(output_reader.inner.counter, 0); + assert_eq!(output_reader.inner.block_len, 0); + assert_eq!(output_reader.inner.flags, 0); + assert!(matches!( + output_reader.inner.platform, + crate::Platform::Portable + )); + assert_eq!(output_reader.position_within_block, 0); +} + +#[test] 
+#[cfg(feature = "std")] +fn test_update_reader() -> Result<(), std::io::Error> { + // This is a brief test, since update_reader() is mostly a wrapper around update(), which already + // has substantial testing. + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + assert_eq!( + crate::Hasher::new().update_reader(&input[..])?.finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +fn test_update_reader_interrupted() -> std::io::Result<()> { + use std::io; + struct InterruptingReader<'a> { + already_interrupted: bool, + slice: &'a [u8], + } + impl<'a> InterruptingReader<'a> { + fn new(slice: &'a [u8]) -> Self { + Self { + already_interrupted: false, + slice, + } + } + } + impl<'a> io::Read for InterruptingReader<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + if !self.already_interrupted { + self.already_interrupted = true; + return Err(io::Error::from(io::ErrorKind::Interrupted)); + } + let take = std::cmp::min(self.slice.len(), buf.len()); + buf[..take].copy_from_slice(&self.slice[..take]); + self.slice = &self.slice[take..]; + Ok(take) + } + } + + let input = b"hello world"; + let mut reader = InterruptingReader::new(input); + let mut hasher = crate::Hasher::new(); + hasher.update_reader(&mut reader)?; + assert_eq!(hasher.finalize(), crate::hash(input)); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +// NamedTempFile isn't Miri-compatible +#[cfg(not(miri))] +fn test_mmap() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already + // has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + assert_eq!( + crate::Hasher::new() + .update_mmap(tempfile.path())?
+ .finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(target_os = "linux")] +fn test_mmap_virtual_file() -> Result<(), std::io::Error> { + // Virtual files like /proc/version can't be mmapped, because their contents don't actually + // exist anywhere in memory. Make sure we fall back to regular file IO in these cases. + // Currently this is handled with a length check, where the assumption is that virtual files + // will always report length 0. If that assumption ever breaks, hopefully this test will catch + // it. + let virtual_filepath = "/proc/version"; + let mut mmap_hasher = crate::Hasher::new(); + // We'll fail right here if the fallback doesn't work. + mmap_hasher.update_mmap(virtual_filepath)?; + let mut read_hasher = crate::Hasher::new(); + read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?; + assert_eq!(mmap_hasher.finalize(), read_hasher.finalize()); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(feature = "rayon")] +// NamedTempFile isn't Miri-compatible +#[cfg(not(miri))] +fn test_mmap_rayon() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(), + // which already has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + assert_eq!( + crate::Hasher::new() + .update_mmap_rayon(tempfile.path())? 
+ .finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +#[cfg(feature = "serde")] +fn test_serde() { + let hash: crate::Hash = [7; 32].into(); + let json = serde_json::to_string(&hash).unwrap(); + assert_eq!( + json, + "[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]", + ); + let hash2: crate::Hash = serde_json::from_str(&json).unwrap(); + assert_eq!(hash, hash2); +} + +// `cargo +nightly miri test` currently works, but it takes forever, because some of our test +// inputs are quite large. Most of our unsafe code is platform specific and incompatible with Miri +// anyway, but we'd like it to be possible for callers to run their own tests under Miri, assuming +// they don't use incompatible features like Rayon or mmap. This test should get reasonable +// coverage of our public API without using any large inputs, so we can run it in CI and catch +// obvious breaks. (For example, constant_time_eq is not compatible with Miri.) +#[test] +fn test_miri_smoketest() { + let mut hasher = crate::Hasher::new_derive_key("Miri smoketest"); + hasher.update(b"foo"); + #[cfg(feature = "std")] + hasher.update_reader(&b"bar"[..]).unwrap(); + assert_eq!(hasher.finalize(), hasher.finalize()); + let mut reader = hasher.finalize_xof(); + reader.set_position(999999); + reader.fill(&mut [0]); +} diff --git a/third-party/blake3/tools/release.md b/third-party/blake3/tools/release.md index 17a07b0f..924f3279 100644 --- a/third-party/blake3/tools/release.md +++ b/third-party/blake3/tools/release.md @@ -4,7 +4,7 @@ - Bump the version in the root Cargo.toml. - Bump the version in b3sum/Cargo.toml. - Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar. -- Update the `--help` output in b3sum/README.md if it's changed. +- Update the `-h` output in b3sum/README.md if it's changed. - Bump `BLAKE3_VERSION_STRING` in c/blake3.h. - Bump `VERSION` in c/CMakeLists.txt. - Make a version bump commit with change notes.