From 5576b88604f6519a60b23317783538a308c0b6d8 Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Fri, 19 Apr 2024 12:25:41 +0900 Subject: [PATCH] Upgrade blake3 in ./third-party to v1.5.1 --- third-party/blake3/.git-blame-ignore-revs | 2 + third-party/blake3/.github/workflows/ci.yml | 77 +- third-party/blake3/.github/workflows/tag.yml | 8 +- third-party/blake3/Cargo.toml | 37 +- third-party/blake3/README.md | 2 + third-party/blake3/b3sum/Cargo.lock | 482 +++----- third-party/blake3/b3sum/Cargo.toml | 5 +- third-party/blake3/b3sum/src/main.rs | 162 +-- third-party/blake3/build.rs | 22 +- third-party/blake3/c/CMakeLists.txt | 67 +- third-party/blake3/c/blake3.c | 19 +- third-party/blake3/c/blake3.h | 2 +- .../c/blake3_c_rust_bindings/Cargo.toml | 6 +- .../c/blake3_c_rust_bindings/src/test.rs | 2 +- third-party/blake3/c/blake3_dispatch.c | 39 +- third-party/blake3/c/blake3_impl.h | 6 +- third-party/blake3/c/blake3_neon.c | 6 +- third-party/blake3/rust/guts/Cargo.toml | 18 + third-party/blake3/rust/guts/readme.md | 80 ++ third-party/blake3/rust/guts/src/lib.rs | 1000 +++++++++++++++++ third-party/blake3/rust/guts/src/portable.rs | 262 +++++ third-party/blake3/rust/guts/src/test.rs | 523 +++++++++ third-party/blake3/src/io.rs | 79 ++ third-party/blake3/src/lib.rs | 304 ++++- third-party/blake3/src/platform.rs | 24 + third-party/blake3/src/test.rs | 208 ++++ third-party/blake3/tools/release.md | 2 +- 27 files changed, 2855 insertions(+), 589 deletions(-) create mode 100644 third-party/blake3/.git-blame-ignore-revs create mode 100644 third-party/blake3/rust/guts/Cargo.toml create mode 100644 third-party/blake3/rust/guts/readme.md create mode 100644 third-party/blake3/rust/guts/src/lib.rs create mode 100644 third-party/blake3/rust/guts/src/portable.rs create mode 100644 third-party/blake3/rust/guts/src/test.rs create mode 100644 third-party/blake3/src/io.rs diff --git a/third-party/blake3/.git-blame-ignore-revs b/third-party/blake3/.git-blame-ignore-revs new file mode 100644 
index 00000000..6e814e69 --- /dev/null +++ b/third-party/blake3/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# CMakeLists.txt whitespace fixups +3e14f865d30271c74fc68d417af488ea91b66d48 diff --git a/third-party/blake3/.github/workflows/ci.yml b/third-party/blake3/.github/workflows/ci.yml index c1a88aaf..e93ecb38 100644 --- a/third-party/blake3/.github/workflows/ci.yml +++ b/third-party/blake3/.github/workflows/ci.yml @@ -38,12 +38,10 @@ jobs: ] steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true # Print the compiler version, for debugging. - name: print compiler version run: cargo run --quiet @@ -52,13 +50,17 @@ jobs: - name: print instruction set support run: cargo run --quiet working-directory: ./tools/instruction_set_support - # Default tests plus Rayon and RustCrypto trait implementations. - - run: cargo test --features=rayon,traits-preview + # Default tests plus Rayon and trait implementations. + - run: cargo test --features=rayon,traits-preview,serde,zeroize # Same but with only one thread in the Rayon pool. This can find deadlocks. - name: "again with RAYON_NUM_THREADS=1" - run: cargo test --features=rayon,traits-preview + run: cargo test --features=rayon,traits-preview,serde,zeroize env: RAYON_NUM_THREADS: 1 + # The mmap feature by itself (update_mmap_rayon is omitted). + - run: cargo test --features=mmap + # All public features put together. + - run: cargo test --features=mmap,rayon,traits-preview,serde,zeroize # no_std tests. 
- run: cargo test --no-default-features @@ -129,6 +131,17 @@ jobs: run: cargo test working-directory: ./reference_impl + # the new guts crate + - name: guts test + run: cargo test --all-features + working-directory: ./rust/guts + - name: guts no_std build + run: cargo build --no-default-features + working-directory: ./rust/guts + - name: guts no_std test # note that rust/guts/src/test.rs still uses libstd + run: cargo test --no-default-features + working-directory: ./rust/guts + b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} @@ -148,16 +161,14 @@ jobs: # The b3sum MSRV is sometimes higher than the blake3 crate's, because # b3sum depends on Clap. We check in the b3sum Cargo.lock, so Clap # update shouldn't randomly break us here. - "1.66.1", + "1.74.1", ] steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} - profile: minimal - override: true # Test b3sum. - name: test b3sum run: cargo test @@ -177,14 +188,13 @@ jobs: - i686-unknown-linux-musl - armv7-unknown-linux-gnueabihf - aarch64-unknown-linux-gnu - - mips-unknown-linux-gnu + # Big-endian targets. See https://twitter.com/burntsushi5/status/1695483429997945092. + - powerpc64-unknown-linux-gnu + - s390x-unknown-linux-gnu steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable - run: cargo install cross # Test the portable implementation on everything. - run: cross test --target ${{ matrix.arch }} @@ -210,7 +220,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Test the intrinsics-based implementations. 
- run: make -f Makefile.testing test working-directory: ./c @@ -262,12 +272,10 @@ jobs: strategy: fail-fast: false steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable with: - toolchain: stable - target: aarch64-apple-darwin - override: true + targets: aarch64-apple-darwin - name: build blake3 run: cargo build --target aarch64-apple-darwin - name: build b3sum @@ -278,7 +286,7 @@ jobs: name: build with the Tiny C Compiler runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: install TCC run: sudo apt-get install -y tcc - name: compile @@ -295,7 +303,7 @@ jobs: name: "compile and test with GCC 5.4" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: addnab/docker-run-action@v3 with: image: gcc:5.4 @@ -308,7 +316,7 @@ jobs: # CMake build test (Library only), current macOS/Linux only. cmake_build: - name: CMake ${{ matrix.os }} + name: CMake ${{ matrix.os }} ${{ matrix.compiler }} runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -323,8 +331,21 @@ jobs: - os: macOS-latest compiler: msvc steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: CMake generation run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target - name: CMake build / install run: cmake --build c/build --target install + + miri_smoketest: + name: Miri smoketest + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + with: + components: miri + # Currently the test search "miri" only matches "test_miri_smoketest", but + # we might add more. If this accidentally picks up anything incompatible or + # slow, we can narrow it. 
+ - run: cargo miri test miri diff --git a/third-party/blake3/.github/workflows/tag.yml b/third-party/blake3/.github/workflows/tag.yml index 3f7e886b..61be4ff9 100644 --- a/third-party/blake3/.github/workflows/tag.yml +++ b/third-party/blake3/.github/workflows/tag.yml @@ -23,18 +23,16 @@ jobs: ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.x" - run: pip install PyGithub - run: sudo apt-get install musl-tools if: matrix.target.os == 'ubuntu-latest' - - uses: actions-rs/toolchain@v1 + - uses: dtolnay/rust-toolchain@stable with: - toolchain: stable - profile: minimal - - run: rustup target add ${{ matrix.target.rust-target }} + targets: ${{ matrix.target.rust-target }} - name: build b3sum id: build_b3sum run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} diff --git a/third-party/blake3/Cargo.toml b/third-party/blake3/Cargo.toml index 8df13874..55eb8a41 100644 --- a/third-party/blake3/Cargo.toml +++ b/third-party/blake3/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "blake3" -version = "1.4.1" +version = "1.5.1" authors = ["Jack O'Connor ", "Samuel Neves"] description = "the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" @@ -23,11 +23,21 @@ neon = [] # --no-default-features, the only way to use the SIMD implementations in this # crate is to enable the corresponding instruction sets statically for the # entire build, with e.g. RUSTFLAGS="-C target-cpu=native". -std = ["digest/std"] +std = [] -# The "rayon" feature (defined below as an optional dependency) enables the -# `Hasher::update_rayon` method, for multithreaded hashing. However, even if -# this feature is enabled, all other APIs remain single-threaded. +# The `rayon` feature (disabled by default, but enabled for docs.rs) adds the +# `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon` +# methods, for multithreaded hashing. 
However, even if this feature is enabled, +# all other APIs remain single-threaded. +rayon = ["dep:rayon", "std"] + +# The `mmap` feature (disabled by default, but enabled for docs.rs) adds the +# `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon` +# helper methods for memory-mapped IO. +mmap = ["std", "dep:memmap2"] + +# Implement the zeroize::Zeroize trait for types in this crate. +zeroize = ["dep:zeroize", "arrayvec/zeroize"] # This crate implements traits from the RustCrypto project, exposed here as the # "traits-preview" feature. However, these traits aren't stable, and they're @@ -78,24 +88,29 @@ no_avx512 = [] no_neon = [] [package.metadata.docs.rs] -# Document Hasher::update_rayon on docs.rs. -features = ["rayon"] +# Document the rayon/mmap methods and the Serialize/Deserialize/Zeroize impls on docs.rs. +features = ["mmap", "rayon", "serde", "zeroize"] [dependencies] arrayref = "0.3.5" -arrayvec = { version = "0.7.0", default-features = false } +arrayvec = { version = "0.7.4", default-features = false } constant_time_eq = "0.3.0" -rayon = { version = "1.2.1", optional = true } cfg-if = "1.0.0" digest = { version = "0.10.1", features = [ "mac" ], optional = true } +memmap2 = { version = "0.9", optional = true } +rayon = { version = "1.2.1", optional = true } +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true } [dev-dependencies] +hmac = "0.12.0" hex = "0.4.2" -page_size = "0.5.0" +page_size = "0.6.0" rand = "0.8.0" rand_chacha = "0.3.0" reference_impl = { path = "./reference_impl" } -hmac = "0.12.0" +tempfile = "3.8.0" +serde_json = "1.0.107" [build-dependencies] cc = "1.0.4" diff --git a/third-party/blake3/README.md b/third-party/blake3/README.md index a63d5f2c..6b493775 100644 --- a/third-party/blake3/README.md +++ b/third-party/blake3/README.md @@ -201,6 +201,7 @@ Alternatively, it is 
licensed under the Apache License 2.0. Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala) +* [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0) * [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16) * [IPFS](https://github.com/ipfs/go-verifcid/issues/13) * [Farcaster](https://www.farcaster.xyz/) @@ -211,6 +212,7 @@ Here's a (non-exhaustive) list of protocols and software that use BLAKE3: * [Saito](https://saito.tech/) * [Skale](https://github.com/skalenetwork/skale-consensus/pull/284) * [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html) +* [Tekken 8](https://en.bandainamcoent.eu/tekken/tekken-8) * [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21) diff --git a/third-party/blake3/b3sum/Cargo.lock b/third-party/blake3/b3sum/Cargo.lock index 2a599a85..2300d3bf 100644 --- a/third-party/blake3/b3sum/Cargo.lock +++ b/third-party/blake3/b3sum/Cargo.lock @@ -4,58 +4,57 @@ version = 3 [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.1" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "anstyle-parse" -version = "0.2.1" +version = "0.2.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anyhow" -version = "1.0.71" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" [[package]] name = "arrayref" @@ -69,22 +68,15 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - [[package]] name = "b3sum" -version = "1.4.1" +version = "1.5.1" dependencies = [ "anyhow", "blake3", "clap", "duct", "hex", - "memmap2", "rayon", "tempfile", "wild", @@ -92,43 +84,28 @@ dependencies = [ [[package]] name = "bitflags" -version = "1.3.2" +version = "2.4.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "blake3" -version = "1.4.1" +version = "1.5.1" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", - "digest", + "memmap2", "rayon", ] -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "cc" -version = "1.0.79" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" [[package]] name = "cfg-if" @@ -138,20 +115,19 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.3.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" +checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ "anstream", "anstyle", @@ -162,9 +138,9 @@ dependencies = [ [[package]] name = 
"clap_derive" -version = "4.3.2" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", "proc-macro2", @@ -174,9 +150,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "colorchoice" @@ -190,75 +166,36 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" -[[package]] -name = "crossbeam-channel" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "duct" -version = "0.13.6" +version = "0.13.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ae3fc31835f74c2a7ceda3aeede378b0ae2e74c8f1c36559fcc9ae2a4e7d3e" +checksum = "e4ab5718d1224b63252cd0c6f74f6480f9ffeb117438a2e0f5cf6d9a4798929c" dependencies = [ "libc", "once_cell", @@ -268,49 +205,25 @@ dependencies = [ [[package]] name = "either" -version = "1.8.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" 
-dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] name = "fastrand" -version = "1.9.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "glob" @@ -324,134 +237,72 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" -[[package]] -name = "hermit-abi" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" - [[package]] name = "hex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys", -] - -[[package]] -name = "is-terminal" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" -dependencies = [ - "hermit-abi", - "rustix 0.38.3", - 
"windows-sys", -] - [[package]] name = "libc" -version = "0.2.147" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" -version = "0.3.8" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" - -[[package]] -name = "linux-raw-sys" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "memmap2" -version = "0.7.1" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" dependencies = [ "libc", ] -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "os_pipe" -version = "1.1.4" +version = "1.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ae859aa07428ca9a929b936690f8b12dc5f11dd8c6992a18ca93919f28bc177" +checksum = "57119c3b893986491ec9aa85056780d3a0f3cf4da7cc09dd3650dbd6c6738fb9" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "proc-macro2" -version = "1.0.63" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.29" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.7.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" dependencies = [ "either", "rayon-core", @@ -459,58 +310,27 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ - "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "num_cpus", -] - -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", ] [[package]] name = "rustix" -version = "0.37.23" +version = 
"0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.8", - "windows-sys", -] - -[[package]] -name = "rustix" -version = "0.38.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4" -dependencies = [ - "bitflags 2.3.3", + "bitflags", "errno", "libc", - "linux-raw-sys 0.4.3", - "windows-sys", + "linux-raw-sys", + "windows-sys 0.52.0", ] -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - [[package]] name = "shared_child" version = "1.0.0" @@ -523,21 +343,15 @@ dependencies = [ [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "subtle" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" [[package]] name = "syn" -version = "2.0.23" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -546,39 +360,31 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.6.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ - "autocfg", "cfg-if", "fastrand", - "redox_syscall", - "rustix 0.37.23", - "windows-sys", + "rustix", + "windows-sys 0.52.0", ] [[package]] name = "terminal_size" -version = "0.2.6" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ - "rustix 0.37.23", - "windows-sys", + "rustix", + "windows-sys 0.48.0", ] -[[package]] -name = "typenum" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" - [[package]] name = "unicode-ident" -version = "1.0.10" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "utf8parse" @@ -586,17 +392,11 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - [[package]] name = "wild" -version = "2.1.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b116685a6be0c52f5a103334cbff26db643826c7b3735fc0a3ba9871310a74" +checksum = "a3131afc8c575281e1e80f36ed6a092aa502c08b18ed7524e86fbbb12bb410e1" dependencies = [ "glob", ] @@ -629,62 +429,128 @@ version = "0.48.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", ] [[package]] name = "windows-targets" -version = "0.48.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = 
"0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + 
+[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/third-party/blake3/b3sum/Cargo.toml b/third-party/blake3/b3sum/Cargo.toml index 02c9405f..812ed224 100644 --- a/third-party/blake3/b3sum/Cargo.toml +++ b/third-party/blake3/b3sum/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "b3sum" -version = "1.4.1" +version = "1.5.1" authors = ["Jack O'Connor "] description = "a command line implementation of the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" @@ -15,10 +15,9 @@ pure = ["blake3/pure"] [dependencies] anyhow = "1.0.25" -blake3 = { version = "1", path = "..", features = ["rayon"] } +blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] } clap = { version = "4.0.8", features = ["derive", "wrap_help"] } hex = "0.4.0" -memmap2 = "0.7.0" rayon = 
"1.2.1" wild = "2.0.3" diff --git a/third-party/blake3/b3sum/src/main.rs b/third-party/blake3/b3sum/src/main.rs index fd35f686..228737ff 100644 --- a/third-party/blake3/b3sum/src/main.rs +++ b/third-party/blake3/b3sum/src/main.rs @@ -163,125 +163,22 @@ impl Args { } } -enum Input { - Mmap(io::Cursor), - File(File), - Stdin, -} - -impl Input { - // Open an input file, using mmap if appropriate. "-" means stdin. Note - // that this convention applies both to command line arguments, and to - // filepaths that appear in a checkfile. - fn open(path: &Path, args: &Args) -> Result { - if path == Path::new("-") { - if args.keyed() { - bail!("Cannot open `-` in keyed mode"); - } - return Ok(Self::Stdin); +fn hash_path(args: &Args, path: &Path) -> Result { + let mut hasher = args.base_hasher.clone(); + if path == Path::new("-") { + if args.keyed() { + bail!("Cannot open `-` in keyed mode"); } - let file = File::open(path)?; - if !args.no_mmap() { - if let Some(mmap) = maybe_memmap_file(&file)? { - return Ok(Self::Mmap(io::Cursor::new(mmap))); - } - } - Ok(Self::File(file)) - } - - fn hash(&mut self, args: &Args) -> Result { - let mut hasher = args.base_hasher.clone(); - match self { - // The fast path: If we mmapped the file successfully, hash using - // multiple threads. This doesn't work on stdin, or on some files, - // and it can also be disabled with --no-mmap. - Self::Mmap(cursor) => { - hasher.update_rayon(cursor.get_ref()); - } - // The slower paths, for stdin or files we didn't/couldn't mmap. - // This is currently all single-threaded. Doing multi-threaded - // hashing without memory mapping is tricky, since all your worker - // threads have to stop every time you refill the buffer, and that - // ends up being a lot of overhead. To solve that, we need a more - // complicated double-buffering strategy where a background thread - // fills one buffer while the worker threads are hashing the other - // one. 
We might implement that in the future, but since this is - // the slow path anyway, it's not high priority. - Self::File(file) => { - copy_wide(file, &mut hasher)?; - } - Self::Stdin => { - let stdin = io::stdin(); - let lock = stdin.lock(); - copy_wide(lock, &mut hasher)?; - } - } - let mut output_reader = hasher.finalize_xof(); - output_reader.set_position(args.seek()); - Ok(output_reader) - } -} - -impl Read for Input { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Mmap(cursor) => cursor.read(buf), - Self::File(file) => file.read(buf), - Self::Stdin => io::stdin().read(buf), - } - } -} - -// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets -// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms -// can support at least 64 KiB, and there's some performance benefit to using -// bigger reads, so that's what we use here. -fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result { - let mut buffer = [0; 65536]; - let mut total = 0; - loop { - match reader.read(&mut buffer) { - Ok(0) => return Ok(total), - Ok(n) => { - hasher.update(&buffer[..n]); - total += n as u64; - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => return Err(e), - } - } -} - -// Mmap a file, if it looks like a good idea. Return None in cases where we -// know mmap will fail, or if the file is short enough that mmapping isn't -// worth it. However, if we do try to mmap and it fails, return the error. -fn maybe_memmap_file(file: &File) -> Result> { - let metadata = file.metadata()?; - let file_size = metadata.len(); - Ok(if !metadata.is_file() { - // Not a real file. - None - } else if file_size > isize::max_value() as u64 { - // Too long to safely map. - // https://github.com/danburkert/memmap-rs/issues/69 - None - } else if file_size == 0 { - // Mapping an empty file currently fails. 
- // https://github.com/danburkert/memmap-rs/issues/72 - None - } else if file_size < 16 * 1024 { - // Mapping small files is not worth it. - None + hasher.update_reader(io::stdin().lock())?; + } else if args.no_mmap() { + hasher.update_reader(File::open(path)?)?; } else { - // Explicitly set the length of the memory map, so that filesystem - // changes can't race to violate the invariants we just checked. - let map = unsafe { - memmap2::MmapOptions::new() - .len(file_size as usize) - .map(file)? - }; - Some(map) - }) + // The fast path: Try to mmap the file and hash it with multiple threads. + hasher.update_mmap_rayon(path)?; + } + let mut output_reader = hasher.finalize_xof(); + output_reader.set_position(args.seek()); + Ok(output_reader) } fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> { @@ -477,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result { } fn hash_one_input(path: &Path, args: &Args) -> Result<()> { - let mut input = Input::open(path, args)?; - let output = input.hash(args)?; + let output = hash_path(args, path)?; if args.raw() { write_raw_output(output, args)?; return Ok(()); @@ -522,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool { } else { file_string }; - let hash_result: Result = Input::open(&file_path, args) - .and_then(|mut input| input.hash(args)) - .map(|mut hash_output| { + let found_hash: blake3::Hash; + match hash_path(args, &file_path) { + Ok(mut output) => { let mut found_hash_bytes = [0; blake3::OUT_LEN]; - hash_output.fill(&mut found_hash_bytes); - found_hash_bytes.into() - }); - let found_hash: blake3::Hash = match hash_result { - Ok(hash) => hash, + output.fill(&mut found_hash_bytes); + found_hash = found_hash_bytes.into(); + } Err(e) => { println!("{}: FAILED ({})", file_string, e); return false; @@ -549,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool { } fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> { - let 
checkfile_input = Input::open(path, args)?; - let mut bufreader = io::BufReader::new(checkfile_input); + let mut file; + let stdin; + let mut stdin_lock; + let mut bufreader: io::BufReader<&mut dyn Read>; + if path == Path::new("-") { + stdin = io::stdin(); + stdin_lock = stdin.lock(); + bufreader = io::BufReader::new(&mut stdin_lock); + } else { + file = File::open(path)?; + bufreader = io::BufReader::new(&mut file); + } let mut line = String::new(); loop { line.clear(); diff --git a/third-party/blake3/build.rs b/third-party/blake3/build.rs index ac1d6a64..a5dfd062 100644 --- a/third-party/blake3/build.rs +++ b/third-party/blake3/build.rs @@ -60,6 +60,20 @@ fn is_armv7() -> bool { target_components()[0] == "armv7" } +fn endianness() -> String { + let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); + assert!(endianness == "little" || endianness == "big"); + endianness +} + +fn is_little_endian() -> bool { + endianness() == "little" +} + +fn is_big_endian() -> bool { + endianness() == "big" +} + // Windows targets may be using the MSVC toolchain or the GNU toolchain. The // right compiler flags to use depend on the toolchain. 
(And we don't want to // use flag_if_supported, because we don't want features to be silently @@ -253,7 +267,13 @@ fn main() -> Result<(), Box> { } } - if (is_arm() && is_neon()) || (!is_no_neon() && !is_pure() && is_aarch64()) { + if is_neon() && is_big_endian() { + panic!("The NEON implementation doesn't support big-endian ARM.") + } + + if (is_arm() && is_neon()) + || (!is_no_neon() && !is_pure() && is_aarch64() && is_little_endian()) + { println!("cargo:rustc-cfg=blake3_neon"); build_neon_c_intrinsics(); } diff --git a/third-party/blake3/c/CMakeLists.txt b/third-party/blake3/c/CMakeLists.txt index 3190effa..3a3b232d 100644 --- a/third-party/blake3/c/CMakeLists.txt +++ b/third-party/blake3/c/CMakeLists.txt @@ -1,15 +1,23 @@ -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.9 FATAL_ERROR) + +# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD +if (POLICY CMP0128) + cmake_policy(SET CMP0128 NEW) +endif() project(libblake3 - VERSION 1.4.1 + VERSION 1.5.1 DESCRIPTION "BLAKE3 C implementation" LANGUAGES C ASM ) -include(CheckCCompilerFlag) include(FeatureSummary) include(GNUInstallDirs) +# architecture lists for which to enable assembly / SIMD sources +set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) +set(BLAKE3_X86_NAMES i686 x86 X86) +set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # default SIMD compiler flag configuration (can be overriden by toolchains or CLI) if(MSVC) set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") @@ -25,11 +33,13 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1") set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2") set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512") + + if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + # 32-bit ARMv8 needs NEON to be enabled 
explicitly + set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON") + endif() endif() -# architecture lists for which to enable assembly / SIMD sources -set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) -set(BLAKE3_X86_NAMES i686 x86 X86) -set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) # library target add_library(blake3 @@ -42,26 +52,40 @@ add_library(BLAKE3::blake3 ALIAS blake3) # library configuration set(BLAKE3_PKGCONFIG_CFLAGS) if (BUILD_SHARED_LIBS) - target_compile_definitions(blake3 + target_compile_definitions(blake3 PUBLIC BLAKE3_DLL PRIVATE BLAKE3_DLL_EXPORTS ) list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL) endif() -target_include_directories(blake3 PUBLIC $) +target_include_directories(blake3 PUBLIC + $ + $ +) set_target_properties(blake3 PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 0 C_VISIBILITY_PRESET hidden + C_EXTENSIONS OFF ) +target_compile_features(blake3 PUBLIC c_std_99) +# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD +# which may be set by the user or toolchain file +if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD) + set_target_properties(blake3 PROPERTIES C_STANDARD 99) +endif() # optional SIMD sources macro(BLAKE3_DISABLE_SIMD) set(BLAKE3_SIMD_AMD64_ASM OFF) set(BLAKE3_SIMD_X86_INTRINSICS OFF) set(BLAKE3_SIMD_NEON_INTRINSICS OFF) - set_source_files_properties(blake3_dispatch.c PROPERTIES - COMPILE_DEFINITIONS BLAKE3_USE_NEON=0;BLAKE3_NO_SSE2;BLAKE3_NO_SSE41;BLAKE3_NO_AVX2;BLAKE3_NO_AVX512 + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=0 + BLAKE3_NO_SSE2 + BLAKE3_NO_SSE41 + BLAKE3_NO_AVX2 + BLAKE3_NO_AVX512 ) endmacro() @@ -100,7 +124,7 @@ if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM) BLAKE3_DISABLE_SIMD() endif() - else() + else() BLAKE3_DISABLE_SIMD() endif() @@ -122,22 +146,19 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRIN set_source_files_properties(blake3_sse2.c PROPERTIES 
COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") -elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES - OR ((ANDROID_ABI STREQUAL "armeabi-v7a" - OR BLAKE3_USE_NEON_INTRINSICS) - AND (DEFINED BLAKE3_CFLAGS_NEON - OR CMAKE_SIZEOF_VOID_P EQUAL 8))) +elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + OR ANDROID_ABI STREQUAL "armeabi-v7a" + OR BLAKE3_USE_NEON_INTRINSICS) + AND (DEFINED BLAKE3_CFLAGS_NEON + OR CMAKE_SIZEOF_VOID_P EQUAL 8)) set(BLAKE3_SIMD_NEON_INTRINSICS ON) target_sources(blake3 PRIVATE blake3_neon.c ) - target_compile_options(blake3 PRIVATE -DBLAKE3_USE_NEON=1) - - check_c_compiler_flag(-mfpu=neon BLAKE3_MFPU_NEON_SUPPORTED) - if (BLAKE3_MFPU_NEON_SUPPORTED) - target_compile_options(blake3 PRIVATE -mfpu=neon) - endif() + target_compile_definitions(blake3 PRIVATE + BLAKE3_USE_NEON=1 + ) if (DEFINED BLAKE3_CFLAGS_NEON) set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") diff --git a/third-party/blake3/c/blake3.c b/third-party/blake3/c/blake3.c index 692f4b02..1b44c719 100644 --- a/third-party/blake3/c/blake3.c +++ b/third-party/blake3/c/blake3.c @@ -341,21 +341,24 @@ INLINE void compress_subtree_to_parent_node( size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because + // as we just asserted, num_cvs will always be <=2 in that case. But GCC + // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is + // set then it emits incorrect warnings here. We tried a few different + // hacks to silence these, but in the end our hacks just produced different + // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). 
Out of + // desperation, we ifdef out this entire loop when we know it's not needed. +#if MAX_SIMD_DEGREE_OR_2 > 2 + // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; - // The second half of this loop condition is always true, and we just - // asserted it above. But GCC can't tell that it's always true, and if NDEBUG - // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious - // warnings here. GCC 8.5 is particularly sensitive, so if you're changing - // this code, test it against that version. - while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + while (num_cvs > 2) { num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); } +#endif memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } diff --git a/third-party/blake3/c/blake3.h b/third-party/blake3/c/blake3.h index 21e0d7b9..48284e50 100644 --- a/third-party/blake3/c/blake3.h +++ b/third-party/blake3/c/blake3.h @@ -30,7 +30,7 @@ extern "C" { #endif -#define BLAKE3_VERSION_STRING "1.4.1" +#define BLAKE3_VERSION_STRING "1.5.1" #define BLAKE3_KEY_LEN 32 #define BLAKE3_OUT_LEN 32 #define BLAKE3_BLOCK_LEN 64 diff --git a/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml b/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml index fff9f416..c1aee32e 100644 --- a/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml +++ b/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml @@ -20,9 +20,9 @@ neon = [] [dev-dependencies] arrayref = "0.3.5" arrayvec = { version = "0.7.0", default-features = false } -page_size = "0.4.1" -rand = "0.7.2" -rand_chacha = "0.2.1" +page_size = "0.6.0" +rand = "0.8.5" +rand_chacha = "0.3.1" reference_impl = { path = "../../reference_impl" } [build-dependencies] diff --git 
a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs index 1fc077c8..0730d930 100644 --- a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs +++ b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs @@ -485,7 +485,7 @@ fn test_fuzz_hasher() { let mut total_input = 0; // For each test, write 3 inputs of random length. for _ in 0..3 { - let input_len = rng.gen_range(0, INPUT_MAX + 1); + let input_len = rng.gen_range(0..INPUT_MAX + 1); dbg!(input_len); let input = &input_buf[total_input..][..input_len]; hasher.update(input); diff --git a/third-party/blake3/c/blake3_dispatch.c b/third-party/blake3/c/blake3_dispatch.c index 2ab0093e..af6c3dad 100644 --- a/third-party/blake3/c/blake3_dispatch.c +++ b/third-party/blake3/c/blake3_dispatch.c @@ -6,6 +6,7 @@ #if defined(IS_X86) #if defined(_MSC_VER) +#include #include #elif defined(__GNUC__) #include @@ -14,6 +15,32 @@ #endif #endif +#if !defined(BLAKE3_ATOMICS) +#if defined(__has_include) +#if __has_include() && !defined(_MSC_VER) +#define BLAKE3_ATOMICS 1 +#else +#define BLAKE3_ATOMICS 0 +#endif /* __has_include() && !defined(_MSC_VER) */ +#else +#define BLAKE3_ATOMICS 0 +#endif /* defined(__has_include) */ +#endif /* BLAKE3_ATOMICS */ + +#if BLAKE3_ATOMICS +#define ATOMIC_INT _Atomic int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#elif defined(_MSC_VER) +#define ATOMIC_INT LONG +#define ATOMIC_LOAD(x) InterlockedOr(&x, 0) +#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) +#else +#define ATOMIC_INT int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#endif + #define MAYBE_UNUSED(x) (void)((x)) #if defined(IS_X86) @@ -76,7 +103,7 @@ enum cpu_feature { #if !defined(BLAKE3_TESTING) static /* Allow the variable to be controlled manually for testing */ #endif - enum cpu_feature g_cpu_features = UNDEFINED; + ATOMIC_INT g_cpu_features = UNDEFINED; #if !defined(BLAKE3_TESTING) static @@ -84,14 +111,16 @@ static enum 
cpu_feature get_cpu_features(void) { - if (g_cpu_features != UNDEFINED) { - return g_cpu_features; + /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ + enum cpu_feature features = ATOMIC_LOAD(g_cpu_features); + if (features != UNDEFINED) { + return features; } else { #if defined(IS_X86) uint32_t regs[4] = {0}; uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; (void)edx; - enum cpu_feature features = 0; + features = 0; cpuid(regs, 0); const int max_id = *eax; cpuid(regs, 1); @@ -124,7 +153,7 @@ static } } } - g_cpu_features = features; + ATOMIC_STORE(g_cpu_features, features); return features; #else /* How to detect NEON? */ diff --git a/third-party/blake3/c/blake3_impl.h b/third-party/blake3/c/blake3_impl.h index 3ba9ceb0..beab5cf5 100644 --- a/third-party/blake3/c/blake3_impl.h +++ b/third-party/blake3/c/blake3_impl.h @@ -51,7 +51,11 @@ enum blake3_flags { #if !defined(BLAKE3_USE_NEON) // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness #if defined(IS_AARCH64) - #define BLAKE3_USE_NEON 1 + #if defined(__ARM_BIG_ENDIAN) + #define BLAKE3_USE_NEON 0 + #else + #define BLAKE3_USE_NEON 1 + #endif #else #define BLAKE3_USE_NEON 0 #endif diff --git a/third-party/blake3/c/blake3_neon.c b/third-party/blake3/c/blake3_neon.c index 8a818fc7..90bdd572 100644 --- a/third-party/blake3/c/blake3_neon.c +++ b/third-party/blake3/c/blake3_neon.c @@ -10,14 +10,12 @@ INLINE uint32x4_t loadu_128(const uint8_t src[16]) { // vld1q_u32 has alignment requirements. Don't use it. - uint32x4_t x; - memcpy(&x, src, 16); - return x; + return vreinterpretq_u32_u8(vld1q_u8(src)); } INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { // vst1q_u32 has alignment requirements. Don't use it. 
- memcpy(dest, &src, 16); + vst1q_u8(dest, vreinterpretq_u8_u32(src)); } INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { diff --git a/third-party/blake3/rust/guts/Cargo.toml b/third-party/blake3/rust/guts/Cargo.toml new file mode 100644 index 00000000..ebcf77fd --- /dev/null +++ b/third-party/blake3/rust/guts/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "blake3_guts" +version = "0.0.0" +authors = ["Jack O'Connor ", "Samuel Neves"] +description = "low-level building blocks for the BLAKE3 hash function" +repository = "https://github.com/BLAKE3-team/BLAKE3" +license = "CC0-1.0 OR Apache-2.0" +documentation = "https://docs.rs/blake3_guts" +readme = "readme.md" +edition = "2021" + +[dev-dependencies] +hex = "0.4.3" +reference_impl = { path = "../../reference_impl" } + +[features] +default = ["std"] +std = [] diff --git a/third-party/blake3/rust/guts/readme.md b/third-party/blake3/rust/guts/readme.md new file mode 100644 index 00000000..4957816d --- /dev/null +++ b/third-party/blake3/rust/guts/readme.md @@ -0,0 +1,80 @@ +# The BLAKE3 Guts API + +## Introduction + +This [`blake3_guts`](https://crates.io/crates/blake3_guts) sub-crate contains +low-level, high-performance, platform-specific implementations of the BLAKE3 +compression function. This API is complicated and unsafe, and this crate will +never have a stable release. Most callers should instead use the +[`blake3`](https://crates.io/crates/blake3) crate, which will eventually depend +on this one internally. + +The code you see here (as of January 2024) is an early stage of a large planned +refactor. The motivation for this refactor is a couple of missing features in +both the Rust and C implementations: + +- The output side + ([`OutputReader`](https://docs.rs/blake3/latest/blake3/struct.OutputReader.html) + in Rust) doesn't take advantage of the most important SIMD optimizations that + compute multiple blocks in parallel. 
This blocks any project that wants to + use the BLAKE3 XOF as a stream cipher + ([[1]](https://github.com/oconnor663/bessie), + [[2]](https://github.com/oconnor663/blake3_aead)). +- Low-level callers like [Bao](https://github.com/oconnor663/bao) that need + interior nodes of the tree also don't get those SIMD optimizations. They have + to use a slow, minimalistic, unstable, doc-hidden module [(also called + `guts`)](https://github.com/BLAKE3-team/BLAKE3/blob/master/src/guts.rs). + +The difficulty with adding those features is that they require changes to all +of our optimized assembly and C intrinsics code. That's a couple dozen +different files that are large, platform-specific, difficult to understand, and +full of duplicated code. The higher-level Rust and C implementations of BLAKE3 +both depend on these files and will need to coordinate changes. + +At the same time, it won't be long before we add support for more platforms: + +- RISCV vector extensions +- ARM SVE +- WebAssembly SIMD + +It's important to get this refactor done before new platforms make it even +harder to do. + +## The private guts API + +This is the API that each platform reimplements, so we want it to be as simple +as possible apart from the high-performance work it needs to do. It's +completely `unsafe`, and inputs and outputs are raw pointers that are allowed +to alias (this matters for `hash_parents`, see below). + +- `degree` +- `compress` + - The single compression function, for short inputs and odd-length tails. +- `hash_chunks` +- `hash_parents` +- `xof` +- `xof_xor` + - As `xof` but XOR'ing the result into the output buffer. +- `universal_hash` + - This is a new construction specifically to support + [BLAKE3-AEAD](https://github.com/oconnor663/blake3_aead). Some + implementations might just stub it out with portable code. + +## The public guts API + +This is the API that this crate exposes to callers, i.e. to the main `blake3` +crate. 
It's a thin, portable layer on top of the private API above. The Rust +version of this API is memory-safe. + +- `degree` +- `compress` +- `hash_chunks` +- `hash_parents` + - This handles most levels of the tree, where we keep hashing SIMD_DEGREE + parents at a time. +- `reduce_parents` + - This uses the same `hash_parents` private API, but it handles the top + levels of the tree where we reduce in-place to the root parent node. +- `xof` +- `xof_xor` +- `universal_hash` diff --git a/third-party/blake3/rust/guts/src/lib.rs b/third-party/blake3/rust/guts/src/lib.rs new file mode 100644 index 00000000..e9b4914b --- /dev/null +++ b/third-party/blake3/rust/guts/src/lib.rs @@ -0,0 +1,1000 @@ +//! # The BLAKE3 Guts API +//! +//! See `readme.md`. +//! +//! The main entrypoint into this crate is [`DETECTED_IMPL`], which is a global [`Implementation`] +//! that atomically initializes itself the first time you use it. +//! +//! # Example +//! +//! ```rust +//! use blake3_guts::{TransposedVectors, DETECTED_IMPL, IV_BYTES, PARENT, ROOT}; +//! +//! // Hash an input of exactly two chunks. +//! let input = [0u8; 2048]; +//! let mut outputs = TransposedVectors::new(); +//! let (left_outputs, _) = DETECTED_IMPL.split_transposed_vectors(&mut outputs); +//! DETECTED_IMPL.hash_chunks( +//! &input, +//! &IV_BYTES, +//! 0, // counter +//! 0, // flags +//! left_outputs, +//! ); +//! let root_node = outputs.extract_parent_node(0); +//! let hash = DETECTED_IMPL.compress( +//! &root_node, +//! 64, // block_len +//! &IV_BYTES, +//! 0, // counter +//! PARENT | ROOT, +//! ); +//! +//! // Compute the same hash using the reference implementation. +//! let mut reference_hasher = reference_impl::Hasher::new(); +//! reference_hasher.update(&input); +//! let mut expected_hash = [0u8; 32]; +//! reference_hasher.finalize(&mut expected_hash); +//! +//! assert_eq!(hash, expected_hash); +//! ``` + +// Tests always require libstd. 
+#![cfg_attr(all(not(feature = "std"), not(test)), no_std)] + +use core::cmp; +use core::marker::PhantomData; +use core::mem; +use core::ptr; +use core::sync::atomic::{AtomicPtr, Ordering::Relaxed}; + +pub mod portable; + +#[cfg(test)] +mod test; + +pub const OUT_LEN: usize = 32; +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; +pub const WORD_LEN: usize = 4; +pub const UNIVERSAL_HASH_LEN: usize = 16; + +pub const CHUNK_START: u32 = 1 << 0; +pub const CHUNK_END: u32 = 1 << 1; +pub const PARENT: u32 = 1 << 2; +pub const ROOT: u32 = 1 << 3; +pub const KEYED_HASH: u32 = 1 << 4; +pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +pub const IV: CVWords = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; +pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV); + +pub const MSG_SCHEDULE: [[usize; 16]; 7] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], + [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], + [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], + [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], + [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], + [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], +]; + +// never less than 2 +pub const MAX_SIMD_DEGREE: usize = 2; + +pub type CVBytes = [u8; 32]; +pub type CVWords = [u32; 8]; +pub type BlockBytes = [u8; 64]; +pub type BlockWords = [u32; 16]; + +pub static DETECTED_IMPL: Implementation = Implementation::new( + degree_init, + compress_init, + hash_chunks_init, + hash_parents_init, + xof_init, + xof_xor_init, + universal_hash_init, +); + +fn detect() -> Implementation { + portable::implementation() +} + +fn init_detected_impl() { + let detected = detect(); + + DETECTED_IMPL + .degree_ptr + .store(detected.degree_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .compress_ptr + 
.store(detected.compress_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .hash_chunks_ptr + .store(detected.hash_chunks_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .hash_parents_ptr + .store(detected.hash_parents_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .xof_ptr + .store(detected.xof_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .xof_xor_ptr + .store(detected.xof_xor_ptr.load(Relaxed), Relaxed); + DETECTED_IMPL + .universal_hash_ptr + .store(detected.universal_hash_ptr.load(Relaxed), Relaxed); +} + +pub struct Implementation { + degree_ptr: AtomicPtr<()>, + compress_ptr: AtomicPtr<()>, + hash_chunks_ptr: AtomicPtr<()>, + hash_parents_ptr: AtomicPtr<()>, + xof_ptr: AtomicPtr<()>, + xof_xor_ptr: AtomicPtr<()>, + universal_hash_ptr: AtomicPtr<()>, +} + +impl Implementation { + const fn new( + degree_fn: DegreeFn, + compress_fn: CompressFn, + hash_chunks_fn: HashChunksFn, + hash_parents_fn: HashParentsFn, + xof_fn: XofFn, + xof_xor_fn: XofFn, + universal_hash_fn: UniversalHashFn, + ) -> Self { + Self { + degree_ptr: AtomicPtr::new(degree_fn as *mut ()), + compress_ptr: AtomicPtr::new(compress_fn as *mut ()), + hash_chunks_ptr: AtomicPtr::new(hash_chunks_fn as *mut ()), + hash_parents_ptr: AtomicPtr::new(hash_parents_fn as *mut ()), + xof_ptr: AtomicPtr::new(xof_fn as *mut ()), + xof_xor_ptr: AtomicPtr::new(xof_xor_fn as *mut ()), + universal_hash_ptr: AtomicPtr::new(universal_hash_fn as *mut ()), + } + } + + #[inline] + fn degree_fn(&self) -> DegreeFn { + unsafe { mem::transmute(self.degree_ptr.load(Relaxed)) } + } + + #[inline] + pub fn degree(&self) -> usize { + let degree = unsafe { self.degree_fn()() }; + debug_assert!(degree >= 2); + debug_assert!(degree <= MAX_SIMD_DEGREE); + debug_assert_eq!(1, degree.count_ones(), "power of 2"); + degree + } + + #[inline] + pub fn split_transposed_vectors<'v>( + &self, + vectors: &'v mut TransposedVectors, + ) -> (TransposedSplit<'v>, TransposedSplit<'v>) { + unsafe { vectors.split(self.degree()) } + } + + #[inline] + 
fn compress_fn(&self) -> CompressFn { + unsafe { mem::transmute(self.compress_ptr.load(Relaxed)) } + } + + #[inline] + pub fn compress( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + counter: u64, + flags: u32, + ) -> CVBytes { + let mut out = [0u8; 32]; + unsafe { + self.compress_fn()(block, block_len, cv, counter, flags, &mut out); + } + out + } + + // The contract for HashChunksFn doesn't require the implementation to support single-chunk + // inputs. Instead we handle that case here by calling compress in a loop. + #[inline] + fn hash_one_chunk( + &self, + mut input: &[u8], + key: &CVBytes, + counter: u64, + mut flags: u32, + output: TransposedSplit, + ) { + debug_assert!(input.len() <= CHUNK_LEN); + let mut cv = *key; + flags |= CHUNK_START; + while input.len() > BLOCK_LEN { + cv = self.compress( + input[..BLOCK_LEN].try_into().unwrap(), + BLOCK_LEN as u32, + &cv, + counter, + flags, + ); + input = &input[BLOCK_LEN..]; + flags &= !CHUNK_START; + } + let mut final_block = [0u8; BLOCK_LEN]; + final_block[..input.len()].copy_from_slice(input); + cv = self.compress( + &final_block, + input.len() as u32, + &cv, + counter, + flags | CHUNK_END, + ); + unsafe { + write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr); + } + } + + #[inline] + fn hash_chunks_fn(&self) -> HashChunksFn { + unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) } + } + + #[inline] + pub fn hash_chunks( + &self, + input: &[u8], + key: &CVBytes, + counter: u64, + flags: u32, + transposed_output: TransposedSplit, + ) -> usize { + debug_assert!(input.len() <= self.degree() * CHUNK_LEN); + if input.len() <= CHUNK_LEN { + // The underlying hash_chunks_fn isn't required to support this case. Instead we handle + // it by calling compress_fn in a loop. But note that we still don't support root + // finalization or the empty input here. 
+ self.hash_one_chunk(input, key, counter, flags, transposed_output); + return 1; + } + // SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently + // ignore the remainder. This makes it impossible to write out of bounds in a properly + // constructed TransposedSplit. + let len = cmp::min(input.len(), MAX_SIMD_DEGREE * CHUNK_LEN); + unsafe { + self.hash_chunks_fn()( + input.as_ptr(), + len, + key, + counter, + flags, + transposed_output.ptr, + ); + } + if input.len() % CHUNK_LEN == 0 { + input.len() / CHUNK_LEN + } else { + (input.len() / CHUNK_LEN) + 1 + } + } + + #[inline] + fn hash_parents_fn(&self) -> HashParentsFn { + unsafe { mem::transmute(self.hash_parents_ptr.load(Relaxed)) } + } + + #[inline] + pub fn hash_parents( + &self, + transposed_input: &TransposedVectors, + mut num_cvs: usize, + key: &CVBytes, + flags: u32, + transposed_output: TransposedSplit, + ) -> usize { + debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); + // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. + num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); + let mut odd_cv = [0u32; 8]; + if num_cvs % 2 == 1 { + unsafe { + odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1)); + } + } + let num_parents = num_cvs / 2; + unsafe { + self.hash_parents_fn()( + transposed_input.as_ptr(), + num_parents, + key, + flags | PARENT, + transposed_output.ptr, + ); + } + if num_cvs % 2 == 1 { + unsafe { + write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents)); + } + num_parents + 1 + } else { + num_parents + } + } + + #[inline] + pub fn reduce_parents( + &self, + transposed_in_out: &mut TransposedVectors, + mut num_cvs: usize, + key: &CVBytes, + flags: u32, + ) -> usize { + debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); + // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. 
+ num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); + let in_out_ptr = transposed_in_out.as_mut_ptr(); + let mut odd_cv = [0u32; 8]; + if num_cvs % 2 == 1 { + unsafe { + odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1)); + } + } + let num_parents = num_cvs / 2; + unsafe { + self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr); + } + if num_cvs % 2 == 1 { + unsafe { + write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents)); + } + num_parents + 1 + } else { + num_parents + } + } + + #[inline] + fn xof_fn(&self) -> XofFn { + unsafe { mem::transmute(self.xof_ptr.load(Relaxed)) } + } + + #[inline] + pub fn xof( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + mut counter: u64, + flags: u32, + mut out: &mut [u8], + ) { + let degree = self.degree(); + let simd_len = degree * BLOCK_LEN; + while !out.is_empty() { + let take = cmp::min(simd_len, out.len()); + unsafe { + self.xof_fn()( + block, + block_len, + cv, + counter, + flags | ROOT, + out.as_mut_ptr(), + take, + ); + } + out = &mut out[take..]; + counter += degree as u64; + } + } + + #[inline] + fn xof_xor_fn(&self) -> XofFn { + unsafe { mem::transmute(self.xof_xor_ptr.load(Relaxed)) } + } + + #[inline] + pub fn xof_xor( + &self, + block: &BlockBytes, + block_len: u32, + cv: &CVBytes, + mut counter: u64, + flags: u32, + mut out: &mut [u8], + ) { + let degree = self.degree(); + let simd_len = degree * BLOCK_LEN; + while !out.is_empty() { + let take = cmp::min(simd_len, out.len()); + unsafe { + self.xof_xor_fn()( + block, + block_len, + cv, + counter, + flags | ROOT, + out.as_mut_ptr(), + take, + ); + } + out = &mut out[take..]; + counter += degree as u64; + } + } + + #[inline] + fn universal_hash_fn(&self) -> UniversalHashFn { + unsafe { mem::transmute(self.universal_hash_ptr.load(Relaxed)) } + } + + #[inline] + pub fn universal_hash(&self, mut input: &[u8], key: &CVBytes, mut counter: u64) -> [u8; 16] { + let degree = self.degree(); + let simd_len = degree * 
BLOCK_LEN; + let mut ret = [0u8; 16]; + while !input.is_empty() { + let take = cmp::min(simd_len, input.len()); + let mut output = [0u8; 16]; + unsafe { + self.universal_hash_fn()(input.as_ptr(), take, key, counter, &mut output); + } + input = &input[take..]; + counter += degree as u64; + for byte_index in 0..16 { + ret[byte_index] ^= output[byte_index]; + } + } + ret + } +} + +impl Clone for Implementation { + fn clone(&self) -> Self { + Self { + degree_ptr: AtomicPtr::new(self.degree_ptr.load(Relaxed)), + compress_ptr: AtomicPtr::new(self.compress_ptr.load(Relaxed)), + hash_chunks_ptr: AtomicPtr::new(self.hash_chunks_ptr.load(Relaxed)), + hash_parents_ptr: AtomicPtr::new(self.hash_parents_ptr.load(Relaxed)), + xof_ptr: AtomicPtr::new(self.xof_ptr.load(Relaxed)), + xof_xor_ptr: AtomicPtr::new(self.xof_xor_ptr.load(Relaxed)), + universal_hash_ptr: AtomicPtr::new(self.universal_hash_ptr.load(Relaxed)), + } + } +} + +// never less than 2 +type DegreeFn = unsafe extern "C" fn() -> usize; + +unsafe extern "C" fn degree_init() -> usize { + init_detected_impl(); + DETECTED_IMPL.degree_fn()() +} + +type CompressFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, // may overlap the input +); + +unsafe extern "C" fn compress_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, +) { + init_detected_impl(); + DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out); +} + +type CompressXofFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut BlockBytes, // may overlap the input +); + +type HashChunksFn = unsafe extern "C" fn( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +); + +unsafe 
extern "C" fn hash_chunks_init( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +) { + init_detected_impl(); + DETECTED_IMPL.hash_chunks_fn()(input, input_len, key, counter, flags, transposed_output); +} + +type HashParentsFn = unsafe extern "C" fn( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, // may overlap the input +); + +unsafe extern "C" fn hash_parents_init( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, +) { + init_detected_impl(); + DETECTED_IMPL.hash_parents_fn()(transposed_input, num_parents, key, flags, transposed_output); +} + +// This signature covers both xof() and xof_xor(). +type XofFn = unsafe extern "C" fn( + block: *const BlockBytes, // zero padded to 64 bytes + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +); + +unsafe extern "C" fn xof_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + init_detected_impl(); + DETECTED_IMPL.xof_fn()(block, block_len, cv, counter, flags, out, out_len); +} + +unsafe extern "C" fn xof_xor_init( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + init_detected_impl(); + DETECTED_IMPL.xof_xor_fn()(block, block_len, cv, counter, flags, out, out_len); +} + +type UniversalHashFn = unsafe extern "C" fn( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +); + +unsafe extern "C" fn universal_hash_init( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +) { + init_detected_impl(); + DETECTED_IMPL.universal_hash_fn()(input, input_len, key, counter, out); +} + +// The 
implicit degree of this implementation is MAX_SIMD_DEGREE. +#[inline(always)] +unsafe fn hash_chunks_using_compress( + compress: CompressFn, + mut input: *const u8, + mut input_len: usize, + key: *const CVBytes, + mut counter: u64, + flags: u32, + mut transposed_output: *mut u32, +) { + debug_assert!(input_len > 0); + debug_assert!(input_len <= MAX_SIMD_DEGREE * CHUNK_LEN); + input_len = cmp::min(input_len, MAX_SIMD_DEGREE * CHUNK_LEN); + while input_len > 0 { + let mut chunk_len = cmp::min(input_len, CHUNK_LEN); + input_len -= chunk_len; + // We only use 8 words of the CV, but compress returns 16. + let mut cv = *key; + let cv_ptr: *mut CVBytes = &mut cv; + let mut chunk_flags = flags | CHUNK_START; + while chunk_len > BLOCK_LEN { + compress( + input as *const BlockBytes, + BLOCK_LEN as u32, + cv_ptr, + counter, + chunk_flags, + cv_ptr, + ); + input = input.add(BLOCK_LEN); + chunk_len -= BLOCK_LEN; + chunk_flags &= !CHUNK_START; + } + let mut last_block = [0u8; BLOCK_LEN]; + ptr::copy_nonoverlapping(input, last_block.as_mut_ptr(), chunk_len); + input = input.add(chunk_len); + compress( + &last_block, + chunk_len as u32, + cv_ptr, + counter, + chunk_flags | CHUNK_END, + cv_ptr, + ); + let cv_words = words_from_le_bytes_32(&cv); + for word_index in 0..8 { + transposed_output + .add(word_index * TRANSPOSED_STRIDE) + .write(cv_words[word_index]); + } + transposed_output = transposed_output.add(1); + counter += 1; + } +} + +// The implicit degree of this implementation is MAX_SIMD_DEGREE. 
+#[inline(always)] +unsafe fn hash_parents_using_compress( + compress: CompressFn, + mut transposed_input: *const u32, + mut num_parents: usize, + key: *const CVBytes, + flags: u32, + mut transposed_output: *mut u32, // may overlap the input +) { + debug_assert!(num_parents > 0); + debug_assert!(num_parents <= MAX_SIMD_DEGREE); + while num_parents > 0 { + let mut block_bytes = [0u8; 64]; + for word_index in 0..8 { + let left_child_word = transposed_input.add(word_index * TRANSPOSED_STRIDE).read(); + block_bytes[WORD_LEN * word_index..][..WORD_LEN] + .copy_from_slice(&left_child_word.to_le_bytes()); + let right_child_word = transposed_input + .add(word_index * TRANSPOSED_STRIDE + 1) + .read(); + block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN] + .copy_from_slice(&right_child_word.to_le_bytes()); + } + let mut cv = [0u8; 32]; + compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv); + let cv_words = words_from_le_bytes_32(&cv); + for word_index in 0..8 { + transposed_output + .add(word_index * TRANSPOSED_STRIDE) + .write(cv_words[word_index]); + } + transposed_input = transposed_input.add(2); + transposed_output = transposed_output.add(1); + num_parents -= 1; + } +} + +#[inline(always)] +unsafe fn xof_using_compress_xof( + compress_xof: CompressXofFn, + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + mut counter: u64, + flags: u32, + mut out: *mut u8, + mut out_len: usize, +) { + debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); + while out_len > 0 { + let mut block_output = [0u8; 64]; + compress_xof(block, block_len, cv, counter, flags, &mut block_output); + let take = cmp::min(out_len, BLOCK_LEN); + ptr::copy_nonoverlapping(block_output.as_ptr(), out, take); + out = out.add(take); + out_len -= take; + counter += 1; + } +} + +#[inline(always)] +unsafe fn xof_xor_using_compress_xof( + compress_xof: CompressXofFn, + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + mut counter: u64, + flags: u32, + mut out: 
*mut u8, + mut out_len: usize, +) { + debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); + while out_len > 0 { + let mut block_output = [0u8; 64]; + compress_xof(block, block_len, cv, counter, flags, &mut block_output); + let take = cmp::min(out_len, BLOCK_LEN); + for i in 0..take { + *out.add(i) ^= block_output[i]; + } + out = out.add(take); + out_len -= take; + counter += 1; + } +} + +#[inline(always)] +unsafe fn universal_hash_using_compress( + compress: CompressFn, + mut input: *const u8, + mut input_len: usize, + key: *const CVBytes, + mut counter: u64, + out: *mut [u8; 16], +) { + let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT; + let mut result = [0u8; 16]; + while input_len > 0 { + let block_len = cmp::min(input_len, BLOCK_LEN); + let mut block = [0u8; BLOCK_LEN]; + ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len); + let mut block_output = [0u8; 32]; + compress( + &block, + block_len as u32, + key, + counter, + flags, + &mut block_output, + ); + for i in 0..16 { + result[i] ^= block_output[i]; + } + input = input.add(block_len); + input_len -= block_len; + counter += 1; + } + *out = result; +} + +// this is in units of *words*, for pointer operations on *const/*mut u32 +const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE; + +#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), repr(C, align(64)))] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]); + +impl TransposedVectors { + pub fn new() -> Self { + Self([[0; 2 * MAX_SIMD_DEGREE]; 8]) + } + + pub fn extract_cv(&self, cv_index: usize) -> CVBytes { + let mut words = [0u32; 8]; + for word_index in 0..8 { + words[word_index] = self.0[word_index][cv_index]; + } + le_bytes_from_words_32(&words) + } + + pub fn extract_parent_node(&self, parent_index: usize) -> BlockBytes { + let mut bytes = [0u8; 64]; + bytes[..32].copy_from_slice(&self.extract_cv(parent_index / 2)); + bytes[32..].copy_from_slice(&self.extract_cv(parent_index 
/ 2 + 1)); + bytes + } + + fn as_ptr(&self) -> *const u32 { + self.0[0].as_ptr() + } + + fn as_mut_ptr(&mut self) -> *mut u32 { + self.0[0].as_mut_ptr() + } + + // SAFETY: This function is just pointer arithmetic, but callers assume that it's safe (not + // necessarily correct) to write up to `degree` words to either side of the split, possibly + // from different threads. + unsafe fn split(&mut self, degree: usize) -> (TransposedSplit, TransposedSplit) { + debug_assert!(degree > 0); + debug_assert!(degree <= MAX_SIMD_DEGREE); + debug_assert_eq!(degree.count_ones(), 1, "power of 2"); + let ptr = self.as_mut_ptr(); + let left = TransposedSplit { + ptr, + phantom_data: PhantomData, + }; + let right = TransposedSplit { + ptr: ptr.wrapping_add(degree), + phantom_data: PhantomData, + }; + (left, right) + } +} + +pub struct TransposedSplit<'vectors> { + ptr: *mut u32, + phantom_data: PhantomData<&'vectors mut u32>, +} + +unsafe impl<'vectors> Send for TransposedSplit<'vectors> {} +unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {} + +unsafe fn read_transposed_cv(src: *const u32) -> CVWords { + let mut cv = [0u32; 8]; + for word_index in 0..8 { + let offset_words = word_index * TRANSPOSED_STRIDE; + cv[word_index] = src.add(offset_words).read(); + } + cv +} + +unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) { + for word_index in 0..8 { + let offset_words = word_index * TRANSPOSED_STRIDE; + dest.add(offset_words).write(cv[word_index]); + } +} + +#[inline(always)] +pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes { + let mut bytes = [0u8; 32]; + // This loop is super verbose because currently that's what it takes to be const. 
+ let mut word_index = 0; + while word_index < bytes.len() / WORD_LEN { + let word_bytes = words[word_index].to_le_bytes(); + let mut byte_index = 0; + while byte_index < WORD_LEN { + bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; + byte_index += 1; + } + word_index += 1; + } + bytes +} + +#[inline(always)] +pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes { + let mut bytes = [0u8; 64]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < bytes.len() / WORD_LEN { + let word_bytes = words[word_index].to_le_bytes(); + let mut byte_index = 0; + while byte_index < WORD_LEN { + bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; + byte_index += 1; + } + word_index += 1; + } + bytes +} + +#[inline(always)] +pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords { + let mut words = [0u32; 8]; + // This loop is super verbose because currently that's what it takes to be const. + let mut word_index = 0; + while word_index < words.len() { + let mut word_bytes = [0u8; WORD_LEN]; + let mut byte_index = 0; + while byte_index < WORD_LEN { + word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; + byte_index += 1; + } + words[word_index] = u32::from_le_bytes(word_bytes); + word_index += 1; + } + words +} + +#[inline(always)] +pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords { + let mut words = [0u32; 16]; + // This loop is super verbose because currently that's what it takes to be const. 
+ let mut word_index = 0; + while word_index < words.len() { + let mut word_bytes = [0u8; WORD_LEN]; + let mut byte_index = 0; + while byte_index < WORD_LEN { + word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; + byte_index += 1; + } + words[word_index] = u32::from_le_bytes(word_bytes); + word_index += 1; + } + words +} + +#[test] +fn test_byte_word_round_trips() { + let cv = *b"This is 32 LE bytes/eight words."; + assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv))); + let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words."; + assert_eq!( + block, + le_bytes_from_words_64(&words_from_le_bytes_64(&block)), + ); +} + +// The largest power of two less than or equal to `n`, used for left_len() +// immediately below, and also directly in Hasher::update(). +pub fn largest_power_of_two_leq(n: usize) -> usize { + ((n / 2) + 1).next_power_of_two() +} + +#[test] +fn test_largest_power_of_two_leq() { + let input_output = &[ + // The zero case is nonsensical, but it does work. + (0, 1), + (1, 1), + (2, 2), + (3, 2), + (4, 4), + (5, 4), + (6, 4), + (7, 4), + (8, 8), + // the largest possible usize + (usize::MAX, (usize::MAX >> 1) + 1), + ]; + for &(input, output) in input_output { + assert_eq!( + output, + crate::largest_power_of_two_leq(input), + "wrong output for n={}", + input + ); + } +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +pub fn left_len(content_len: usize) -> usize { + debug_assert!(content_len > CHUNK_LEN); + // Subtract 1 to reserve at least one byte for the right side. 
+ let full_chunks = (content_len - 1) / CHUNK_LEN; + largest_power_of_two_leq(full_chunks) * CHUNK_LEN +} + +#[test] +fn test_left_len() { + let input_output = &[ + (CHUNK_LEN + 1, CHUNK_LEN), + (2 * CHUNK_LEN - 1, CHUNK_LEN), + (2 * CHUNK_LEN, CHUNK_LEN), + (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN, 2 * CHUNK_LEN), + (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), + ]; + for &(input, output) in input_output { + assert_eq!(left_len(input), output); + } +} diff --git a/third-party/blake3/rust/guts/src/portable.rs b/third-party/blake3/rust/guts/src/portable.rs new file mode 100644 index 00000000..d5976440 --- /dev/null +++ b/third-party/blake3/rust/guts/src/portable.rs @@ -0,0 +1,262 @@ +use crate::{ + le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64, + BlockBytes, BlockWords, CVBytes, CVWords, Implementation, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE, +}; + +const DEGREE: usize = MAX_SIMD_DEGREE; + +unsafe extern "C" fn degree() -> usize { + DEGREE +} + +#[inline(always)] +fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +#[inline(always)] +fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) { + // Select the message schedule based on the round. + let schedule = MSG_SCHEDULE[round]; + + // Mix the columns. 
+ g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the diagonals. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +#[inline(always)] +fn compress_inner( + block_words: &BlockWords, + block_len: u32, + cv_words: &CVWords, + counter: u64, + flags: u32, +) -> [u32; 16] { + let mut state = [ + cv_words[0], + cv_words[1], + cv_words[2], + cv_words[3], + cv_words[4], + cv_words[5], + cv_words[6], + cv_words[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter as u32, + (counter >> 32) as u32, + block_len as u32, + flags as u32, + ]; + for round_number in 0..7 { + round(&mut state, &block_words, round_number); + } + state +} + +pub(crate) unsafe extern "C" fn compress( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut CVBytes, +) { + let block_words = words_from_le_bytes_64(&*block); + let cv_words = words_from_le_bytes_32(&*cv); + let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); + for word_index in 0..8 { + state[word_index] ^= state[word_index + 8]; + } + *out = le_bytes_from_words_32(state[..8].try_into().unwrap()); +} + +pub(crate) unsafe extern "C" fn compress_xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut BlockBytes, +) { + let block_words = words_from_le_bytes_64(&*block); + let cv_words = words_from_le_bytes_32(&*cv); + let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); + for word_index in 0..8 { + state[word_index] ^= state[word_index + 8]; + state[word_index + 8] ^= 
cv_words[word_index]; + } + *out = le_bytes_from_words_64(&state); +} + +pub(crate) unsafe extern "C" fn hash_chunks( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + flags: u32, + transposed_output: *mut u32, +) { + crate::hash_chunks_using_compress( + compress, + input, + input_len, + key, + counter, + flags, + transposed_output, + ) +} + +pub(crate) unsafe extern "C" fn hash_parents( + transposed_input: *const u32, + num_parents: usize, + key: *const CVBytes, + flags: u32, + transposed_output: *mut u32, // may overlap the input +) { + crate::hash_parents_using_compress( + compress, + transposed_input, + num_parents, + key, + flags, + transposed_output, + ) +} + +pub(crate) unsafe extern "C" fn xof( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + crate::xof_using_compress_xof( + compress_xof, + block, + block_len, + cv, + counter, + flags, + out, + out_len, + ) +} + +pub(crate) unsafe extern "C" fn xof_xor( + block: *const BlockBytes, + block_len: u32, + cv: *const CVBytes, + counter: u64, + flags: u32, + out: *mut u8, + out_len: usize, +) { + crate::xof_xor_using_compress_xof( + compress_xof, + block, + block_len, + cv, + counter, + flags, + out, + out_len, + ) +} + +pub(crate) unsafe extern "C" fn universal_hash( + input: *const u8, + input_len: usize, + key: *const CVBytes, + counter: u64, + out: *mut [u8; 16], +) { + crate::universal_hash_using_compress(compress, input, input_len, key, counter, out) +} + +pub fn implementation() -> Implementation { + Implementation::new( + degree, + compress, + hash_chunks, + hash_parents, + xof, + xof_xor, + universal_hash, + ) +} + +#[cfg(test)] +mod test { + use super::*; + + // This is circular but do it anyway. 
+ #[test] + fn test_compress_vs_portable() { + crate::test::test_compress_vs_portable(&implementation()); + } + + #[test] + fn test_compress_vs_reference() { + crate::test::test_compress_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_hash_chunks_vs_portable() { + crate::test::test_hash_chunks_vs_portable(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_hash_parents_vs_portable() { + crate::test::test_hash_parents_vs_portable(&implementation()); + } + + #[test] + fn test_chunks_and_parents_vs_reference() { + crate::test::test_chunks_and_parents_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_xof_vs_portable() { + crate::test::test_xof_vs_portable(&implementation()); + } + + #[test] + fn test_xof_vs_reference() { + crate::test::test_xof_vs_reference(&implementation()); + } + + // This is circular but do it anyway. + #[test] + fn test_universal_hash_vs_portable() { + crate::test::test_universal_hash_vs_portable(&implementation()); + } + + #[test] + fn test_universal_hash_vs_reference() { + crate::test::test_universal_hash_vs_reference(&implementation()); + } +} diff --git a/third-party/blake3/rust/guts/src/test.rs b/third-party/blake3/rust/guts/src/test.rs new file mode 100644 index 00000000..83bd790c --- /dev/null +++ b/third-party/blake3/rust/guts/src/test.rs @@ -0,0 +1,523 @@ +use crate::*; + +pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; + +// Test a few different initial counter values. +// - 0: The base case. +// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR when +// you're supposed to ANDNOT. +// - u32::MAX: The low word of the counter overflows for all inputs except the first. +// - (42 << 32) + u32::MAX: Same but with a non-zero value in the high word. 
+const INITIAL_COUNTERS: [u64; 4] = [ + 0, + i32::MAX as u64, + u32::MAX as u64, + (42u64 << 32) + u32::MAX as u64, +]; + +const BLOCK_LENGTHS: [usize; 4] = [0, 1, 63, 64]; + +pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +pub fn test_compress_vs_portable(test_impl: &Implementation) { + for block_len in BLOCK_LENGTHS { + dbg!(block_len); + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len]); + for counter in INITIAL_COUNTERS { + dbg!(counter); + let portable_cv = portable::implementation().compress( + &block, + block_len as u32, + &TEST_KEY, + counter, + KEYED_HASH, + ); + + let test_cv = + test_impl.compress(&block, block_len as u32, &TEST_KEY, counter, KEYED_HASH); + + assert_eq!(portable_cv, test_cv); + } + } +} + +pub fn test_compress_vs_reference(test_impl: &Implementation) { + for block_len in BLOCK_LENGTHS { + dbg!(block_len); + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len]); + + let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + ref_hasher.update(&block[..block_len]); + let mut ref_hash = [0u8; 32]; + ref_hasher.finalize(&mut ref_hash); + + let test_cv = test_impl.compress( + &block, + block_len as u32, + &TEST_KEY, + 0, + CHUNK_START | CHUNK_END | ROOT | KEYED_HASH, + ); + + assert_eq!(ref_hash, test_cv); + } +} + +fn check_transposed_eq(output_a: &TransposedVectors, output_b: &TransposedVectors) { + if output_a == output_b { + return; + } + for cv_index in 0..2 * MAX_SIMD_DEGREE { + let cv_a = output_a.extract_cv(cv_index); + let cv_b = output_b.extract_cv(cv_index); + if cv_a == [0; 32] && cv_b == [0; 32] { + println!("CV {cv_index:2} empty"); + } else if cv_a == cv_b { + println!("CV {cv_index:2} matches"); + } else { + println!("CV {cv_index:2} mismatch:"); + println!(" {}", hex::encode(cv_a)); + println!(" {}", hex::encode(cv_b)); + } + } + panic!("transposed outputs are not equal"); +} + +pub fn 
test_hash_chunks_vs_portable(test_impl: &Implementation) { + assert!(test_impl.degree() <= MAX_SIMD_DEGREE); + dbg!(test_impl.degree() * CHUNK_LEN); + // Allocate 4 extra bytes of padding so we can make aligned slices. + let mut input_buf = [0u8; 2 * 2 * MAX_SIMD_DEGREE * CHUNK_LEN + 4]; + let mut input_slice = &mut input_buf[..]; + // Make sure the start of the input is word-aligned. + while input_slice.as_ptr() as usize % 4 != 0 { + input_slice = &mut input_slice[1..]; + } + let (aligned_input, mut unaligned_input) = + input_slice.split_at_mut(2 * MAX_SIMD_DEGREE * CHUNK_LEN); + unaligned_input = &mut unaligned_input[1..][..2 * MAX_SIMD_DEGREE * CHUNK_LEN]; + assert_eq!(aligned_input.as_ptr() as usize % 4, 0); + assert_eq!(unaligned_input.as_ptr() as usize % 4, 1); + paint_test_input(aligned_input); + paint_test_input(unaligned_input); + // Try just below, equal to, and just above every whole number of chunks. + let mut input_2_lengths = Vec::new(); + let mut next_len = 2 * CHUNK_LEN; + loop { + // 95 is one whole block plus one interesting part of another + input_2_lengths.push(next_len - 95); + input_2_lengths.push(next_len); + if next_len == test_impl.degree() * CHUNK_LEN { + break; + } + input_2_lengths.push(next_len + 95); + next_len += CHUNK_LEN; + } + for input_2_len in input_2_lengths { + dbg!(input_2_len); + let aligned_input1 = &aligned_input[..test_impl.degree() * CHUNK_LEN]; + let aligned_input2 = &aligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; + let unaligned_input1 = &unaligned_input[..test_impl.degree() * CHUNK_LEN]; + let unaligned_input2 = &unaligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; + for initial_counter in INITIAL_COUNTERS { + dbg!(initial_counter); + // Make two calls, to test the output_column parameter. 
+ let mut portable_output = TransposedVectors::new(); + let (portable_left, portable_right) = + test_impl.split_transposed_vectors(&mut portable_output); + portable::implementation().hash_chunks( + aligned_input1, + &IV_BYTES, + initial_counter, + 0, + portable_left, + ); + portable::implementation().hash_chunks( + aligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + portable_right, + ); + + let mut test_output = TransposedVectors::new(); + let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); + test_impl.hash_chunks(aligned_input1, &IV_BYTES, initial_counter, 0, test_left); + test_impl.hash_chunks( + aligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + test_right, + ); + check_transposed_eq(&portable_output, &test_output); + + // Do the same thing with unaligned input. + let mut unaligned_test_output = TransposedVectors::new(); + let (unaligned_left, unaligned_right) = + test_impl.split_transposed_vectors(&mut unaligned_test_output); + test_impl.hash_chunks( + unaligned_input1, + &IV_BYTES, + initial_counter, + 0, + unaligned_left, + ); + test_impl.hash_chunks( + unaligned_input2, + &TEST_KEY, + initial_counter + test_impl.degree() as u64, + KEYED_HASH, + unaligned_right, + ); + check_transposed_eq(&portable_output, &unaligned_test_output); + } + } +} + +fn painted_transposed_input() -> TransposedVectors { + let mut vectors = TransposedVectors::new(); + let mut val = 0; + for col in 0..2 * MAX_SIMD_DEGREE { + for row in 0..8 { + vectors.0[row][col] = val; + val += 1; + } + } + vectors +} + +pub fn test_hash_parents_vs_portable(test_impl: &Implementation) { + assert!(test_impl.degree() <= MAX_SIMD_DEGREE); + let input = painted_transposed_input(); + for num_parents in 2..=(test_impl.degree() / 2) { + dbg!(num_parents); + let mut portable_output = TransposedVectors::new(); + let (portable_left, portable_right) = + test_impl.split_transposed_vectors(&mut 
portable_output); + portable::implementation().hash_parents( + &input, + 2 * num_parents, // num_cvs + &IV_BYTES, + 0, + portable_left, + ); + portable::implementation().hash_parents( + &input, + 2 * num_parents, // num_cvs + &TEST_KEY, + KEYED_HASH, + portable_right, + ); + + let mut test_output = TransposedVectors::new(); + let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); + test_impl.hash_parents( + &input, + 2 * num_parents, // num_cvs + &IV_BYTES, + 0, + test_left, + ); + test_impl.hash_parents( + &input, + 2 * num_parents, // num_cvs + &TEST_KEY, + KEYED_HASH, + test_right, + ); + + check_transposed_eq(&portable_output, &test_output); + } +} + +fn hash_with_chunks_and_parents_recurse( + test_impl: &Implementation, + input: &[u8], + counter: u64, + output: TransposedSplit, +) -> usize { + assert!(input.len() > 0); + if input.len() <= test_impl.degree() * CHUNK_LEN { + return test_impl.hash_chunks(input, &IV_BYTES, counter, 0, output); + } + let (left_input, right_input) = input.split_at(left_len(input.len())); + let mut child_output = TransposedVectors::new(); + let (left_output, right_output) = test_impl.split_transposed_vectors(&mut child_output); + let mut children = + hash_with_chunks_and_parents_recurse(test_impl, left_input, counter, left_output); + assert_eq!(children, test_impl.degree()); + children += hash_with_chunks_and_parents_recurse( + test_impl, + right_input, + counter + (left_input.len() / CHUNK_LEN) as u64, + right_output, + ); + test_impl.hash_parents(&child_output, children, &IV_BYTES, PARENT, output) +} + +// Note: This test implementation doesn't support the 1-chunk-or-less case. +fn root_hash_with_chunks_and_parents(test_impl: &Implementation, input: &[u8]) -> CVBytes { + // TODO: handle the 1-chunk case? + assert!(input.len() > CHUNK_LEN); + let mut cvs = TransposedVectors::new(); + // The right half of these vectors are never used. 
+ let (cvs_left, _) = test_impl.split_transposed_vectors(&mut cvs); + let mut num_cvs = hash_with_chunks_and_parents_recurse(test_impl, input, 0, cvs_left); + while num_cvs > 2 { + num_cvs = test_impl.reduce_parents(&mut cvs, num_cvs, &IV_BYTES, 0); + } + test_impl.compress( + &cvs.extract_parent_node(0), + BLOCK_LEN as u32, + &IV_BYTES, + 0, + PARENT | ROOT, + ) +} + +pub fn test_chunks_and_parents_vs_reference(test_impl: &Implementation) { + assert_eq!(test_impl.degree().count_ones(), 1, "power of 2"); + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * CHUNK_LEN; + let mut input_buf = [0u8; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try just below, equal to, and just above every whole number of chunks, except that + // root_hash_with_chunks_and_parents doesn't support the 1-chunk-or-less case. + let mut test_lengths = vec![CHUNK_LEN + 1]; + let mut next_len = 2 * CHUNK_LEN; + loop { + // 95 is one whole block plus one interesting part of another + test_lengths.push(next_len - 95); + test_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + test_lengths.push(next_len + 95); + next_len += CHUNK_LEN; + } + for test_len in test_lengths { + dbg!(test_len); + let input = &input_buf[..test_len]; + + let mut ref_hasher = reference_impl::Hasher::new(); + ref_hasher.update(&input); + let mut ref_hash = [0u8; 32]; + ref_hasher.finalize(&mut ref_hash); + + let test_hash = root_hash_with_chunks_and_parents(test_impl, input); + + assert_eq!(ref_hash, test_hash); + } +} + +pub fn test_xof_vs_portable(test_impl: &Implementation) { + let flags = CHUNK_START | CHUNK_END | KEYED_HASH; + for counter in INITIAL_COUNTERS { + dbg!(counter); + for input_len in [0, 1, BLOCK_LEN] { + dbg!(input_len); + let mut input_block = [0u8; BLOCK_LEN]; + for byte_index in 0..input_len { + input_block[byte_index] = byte_index as u8 + 42; + } + // Try equal to and partway through every whole number of output blocks. 
+ const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut output_lengths = Vec::new(); + let mut next_len = 0; + loop { + output_lengths.push(next_len); + if next_len == MAX_OUTPUT_LEN { + break; + } + output_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for output_len in output_lengths { + dbg!(output_len); + let mut portable_output = [0xff; MAX_OUTPUT_LEN]; + portable::implementation().xof( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut portable_output[..output_len], + ); + let mut test_output = [0xff; MAX_OUTPUT_LEN]; + test_impl.xof( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert_eq!(portable_output, test_output); + + // Double check that the implementation didn't overwrite. + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + + // The first XOR cancels out the output. + test_impl.xof_xor( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert!(test_output[..output_len].iter().all(|&b| b == 0)); + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + + // The second XOR restores out the output. + test_impl.xof_xor( + &input_block, + input_len as u32, + &TEST_KEY, + counter, + flags, + &mut test_output[..output_len], + ); + assert_eq!(portable_output, test_output); + assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); + } + } + } +} + +pub fn test_xof_vs_reference(test_impl: &Implementation) { + let input = b"hello world"; + let mut input_block = [0; BLOCK_LEN]; + input_block[..input.len()].copy_from_slice(input); + + const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut ref_output = [0; MAX_OUTPUT_LEN]; + let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + ref_hasher.update(input); + ref_hasher.finalize(&mut ref_output); + + // Try equal to and partway through every whole number of output blocks. 
+ let mut output_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + output_lengths.push(next_len); + if next_len == MAX_OUTPUT_LEN { + break; + } + output_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + + for output_len in output_lengths { + dbg!(output_len); + let mut test_output = [0; MAX_OUTPUT_LEN]; + test_impl.xof( + &input_block, + input.len() as u32, + &TEST_KEY, + 0, + KEYED_HASH | CHUNK_START | CHUNK_END, + &mut test_output[..output_len], + ); + assert_eq!(ref_output[..output_len], test_output[..output_len]); + + // Double check that the implementation didn't overwrite. + assert!(test_output[output_len..].iter().all(|&b| b == 0)); + + // Do it again starting from block 1. + if output_len >= BLOCK_LEN { + test_impl.xof( + &input_block, + input.len() as u32, + &TEST_KEY, + 1, + KEYED_HASH | CHUNK_START | CHUNK_END, + &mut test_output[..output_len - BLOCK_LEN], + ); + assert_eq!( + ref_output[BLOCK_LEN..output_len], + test_output[..output_len - BLOCK_LEN], + ); + } + } +} + +pub fn test_universal_hash_vs_portable(test_impl: &Implementation) { + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut input_buf = [0; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try equal to and partway through every whole number of input blocks. 
+ let mut input_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + input_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + input_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for input_len in input_lengths { + dbg!(input_len); + for counter in INITIAL_COUNTERS { + dbg!(counter); + let portable_output = portable::implementation().universal_hash( + &input_buf[..input_len], + &TEST_KEY, + counter, + ); + let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, counter); + assert_eq!(portable_output, test_output); + } + } +} + +fn reference_impl_universal_hash(input: &[u8], key: &CVBytes) -> [u8; UNIVERSAL_HASH_LEN] { + // The reference_impl doesn't support XOF seeking, so we have to materialize an entire extended + // output to seek to a block. + const MAX_BLOCKS: usize = 2 * MAX_SIMD_DEGREE; + assert!(input.len() / BLOCK_LEN <= MAX_BLOCKS); + let mut output_buffer: [u8; BLOCK_LEN * MAX_BLOCKS] = [0u8; BLOCK_LEN * MAX_BLOCKS]; + let mut result = [0u8; UNIVERSAL_HASH_LEN]; + let mut block_start = 0; + while block_start < input.len() { + let block_len = cmp::min(input.len() - block_start, BLOCK_LEN); + let mut ref_hasher = reference_impl::Hasher::new_keyed(key); + ref_hasher.update(&input[block_start..block_start + block_len]); + ref_hasher.finalize(&mut output_buffer[..block_start + UNIVERSAL_HASH_LEN]); + for byte_index in 0..UNIVERSAL_HASH_LEN { + result[byte_index] ^= output_buffer[block_start + byte_index]; + } + block_start += BLOCK_LEN; + } + result +} + +pub fn test_universal_hash_vs_reference(test_impl: &Implementation) { + const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; + let mut input_buf = [0; MAX_INPUT_LEN]; + paint_test_input(&mut input_buf); + // Try equal to and partway through every whole number of input blocks. 
+ let mut input_lengths = vec![0, 1, 31]; + let mut next_len = BLOCK_LEN; + loop { + input_lengths.push(next_len); + if next_len == MAX_INPUT_LEN { + break; + } + input_lengths.push(next_len + 31); + next_len += BLOCK_LEN; + } + for input_len in input_lengths { + dbg!(input_len); + let ref_output = reference_impl_universal_hash(&input_buf[..input_len], &TEST_KEY); + let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, 0); + assert_eq!(ref_output, test_output); + } +} diff --git a/third-party/blake3/src/io.rs b/third-party/blake3/src/io.rs new file mode 100644 index 00000000..1c19881e --- /dev/null +++ b/third-party/blake3/src/io.rs @@ -0,0 +1,79 @@ +//! Helper functions for efficient IO. + +#[cfg(feature = "std")] +pub(crate) fn copy_wide( + mut reader: impl std::io::Read, + hasher: &mut crate::Hasher, +) -> std::io::Result { + let mut buffer = [0; 65536]; + let mut total = 0; + loop { + match reader.read(&mut buffer) { + Ok(0) => return Ok(total), + Ok(n) => { + hasher.update(&buffer[..n]); + total += n as u64; + } + // see test_update_reader_interrupted + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + } + } +} + +// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or +// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it +// fails, return the error. +// +// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like +// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get +// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because +// this function is crate-private, we can guarantee that all can ever happen in the event of a race +// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of +// which should risk memory corruption in a safe caller. 
+// +// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no +// platform in the "real world" is ever going to do anything other than compute the "wrong answer" +// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this? +// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a +// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have +// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do +// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to +// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert +// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give +// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's +// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to +// construct a safe &i32 to the register if you're going to leak that reference to unknown callers. +// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally +// different here. Feedback needed. +#[cfg(feature = "mmap")] +pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result> { + let metadata = file.metadata()?; + let file_size = metadata.len(); + #[allow(clippy::if_same_then_else)] + if !metadata.is_file() { + // Not a real file. + Ok(None) + } else if file_size > isize::max_value() as u64 { + // Too long to safely map. + // https://github.com/danburkert/memmap-rs/issues/69 + Ok(None) + } else if file_size == 0 { + // Mapping an empty file currently fails. + // https://github.com/danburkert/memmap-rs/issues/72 + // See test_mmap_virtual_file. + Ok(None) + } else if file_size < 16 * 1024 { + // Mapping small files is not worth it. 
+ Ok(None) + } else { + // Explicitly set the length of the memory map, so that filesystem + // changes can't race to violate the invariants we just checked. + let map = unsafe { + memmap2::MmapOptions::new() + .len(file_size as usize) + .map(file)? + }; + Ok(Some(map)) + } +} diff --git a/third-party/blake3/src/lib.rs b/third-party/blake3/src/lib.rs index ac61fb27..d661cb2d 100644 --- a/third-party/blake3/src/lib.rs +++ b/third-party/blake3/src/lib.rs @@ -33,15 +33,33 @@ //! # Cargo Features //! //! The `std` feature (the only feature enabled by default) is required for -//! implementations of the [`Write`] and [`Seek`] traits, and also for runtime -//! CPU feature detection on x86. If this feature is disabled, the only way to -//! use the x86 SIMD implementations is to enable the corresponding instruction -//! sets globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting -//! binary will not be portable to other machines. +//! implementations of the [`Write`] and [`Seek`] traits, the +//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU +//! feature detection on x86. If this feature is disabled, the only way to use +//! the x86 SIMD implementations is to enable the corresponding instruction sets +//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary +//! will not be portable to other machines. //! //! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds -//! the [`Hasher::update_rayon`] method, for multithreaded hashing. However, -//! even if this feature is enabled, all other APIs remain single-threaded. +//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap` +//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for +//! multithreaded hashing. However, even if this feature is enabled, all other +//! APIs remain single-threaded. +//! +//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the +//! 
[`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above) +//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for +//! memory-mapped IO. +//! +//! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) +//! implements +//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for +//! this crate's types. +//! +//! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements +//! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and +//! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html) +//! for [`Hash`](struct@Hash). //! //! The NEON implementation is enabled by default for AArch64 but requires the //! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and @@ -49,12 +67,12 @@ //! without NEON support. //! //! The `traits-preview` feature enables implementations of traits from the -//! RustCrypto [`digest`] crate, and re-exports that crate as -//! `traits::digest`. However, the traits aren't stable, and they're expected to -//! change in incompatible ways before that crate reaches 1.0. For that reason, -//! this crate makes no SemVer guarantees for this feature, and callers who use -//! it should expect breaking changes between patch versions. (The "-preview" -//! feature name follows the conventions of the RustCrypto [`signature`] crate.) +//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`. +//! However, the traits aren't stable, and they're expected to change in +//! incompatible ways before that crate reaches 1.0. For that reason, this crate +//! makes no SemVer guarantees for this feature, and callers who use it should +//! expect breaking changes between patch versions. (The "-preview" feature name +//! follows the conventions of the RustCrypto [`signature`] crate.) //! //! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon //! 
[BLAKE3]: https://blake3.io @@ -112,6 +130,7 @@ mod sse41; #[cfg(feature = "traits-preview")] pub mod traits; +mod io; mod join; use arrayref::{array_mut_ref, array_ref}; @@ -197,6 +216,8 @@ fn counter_high(counter: u64) -> u32 { /// [`from_hex`]: #method.from_hex /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] +#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[derive(Clone, Copy, Hash)] pub struct Hash([u8; OUT_LEN]); @@ -284,10 +305,28 @@ impl core::str::FromStr for Hash { } } +// A proper implementation of constant time equality is tricky, and we get it from the +// constant_time_eq crate instead of rolling our own. However, that crate isn't compatible with +// Miri, so we roll our own just for that. +#[cfg(miri)] +fn constant_time_eq_miri(a: &[u8], b: &[u8]) -> bool { + if a.len() != b.len() { + return false; + } + let mut x = 0; + for i in 0..a.len() { + x |= a[i] ^ b[i]; + } + x == 0 +} + /// This implementation is constant-time. 
impl PartialEq for Hash { #[inline] fn eq(&self, other: &Hash) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, &other.0); + #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, &other.0) } } @@ -296,6 +335,9 @@ impl PartialEq for Hash { impl PartialEq<[u8; OUT_LEN]> for Hash { #[inline] fn eq(&self, other: &[u8; OUT_LEN]) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, other); + #[cfg(not(miri))] constant_time_eq::constant_time_eq_32(&self.0, other) } } @@ -304,6 +346,9 @@ impl PartialEq<[u8; OUT_LEN]> for Hash { impl PartialEq<[u8]> for Hash { #[inline] fn eq(&self, other: &[u8]) -> bool { + #[cfg(miri)] + return constant_time_eq_miri(&self.0, other); + #[cfg(not(miri))] constant_time_eq::constant_time_eq(&self.0, other) } } @@ -371,6 +416,7 @@ impl std::error::Error for HexError {} // Each chunk or parent node can produce either a 32-byte chaining value or, by // setting the ROOT flag, any number of final output bytes. The Output struct // captures the state just prior to choosing between those two possibilities. +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] struct Output { input_chaining_value: CVWords, @@ -378,6 +424,7 @@ struct Output { block_len: u8, counter: u64, flags: u8, + #[cfg_attr(feature = "zeroize", zeroize(skip))] platform: Platform, } @@ -414,6 +461,7 @@ impl Output { } #[derive(Clone)] +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] struct ChunkState { cv: CVWords, chunk_counter: u64, @@ -421,6 +469,7 @@ struct ChunkState { buf_len: u8, blocks_compressed: u8, flags: u8, + #[cfg_attr(feature = "zeroize", zeroize(skip))] platform: Platform, } @@ -903,6 +952,9 @@ fn parent_node_output( /// An incremental hash state that can accept any number of writes. /// +/// The `rayon` and `mmap` Cargo features enable additional methods on this +/// type related to multithreading and memory-mapped IO. 
+/// /// When the `traits-preview` Cargo feature is enabled, this type implements /// several commonly used traits from the /// [`digest`](https://crates.io/crates/digest) crate. However, those @@ -911,15 +963,6 @@ fn parent_node_output( /// guarantees for this feature, and callers who use it should expect breaking /// changes between patch versions. /// -/// When the `rayon` Cargo feature is enabled, the -/// [`update_rayon`](#method.update_rayon) method is available for multithreaded -/// hashing. -/// -/// **Performance note:** The [`update`](#method.update) method can't take full -/// advantage of SIMD optimizations if its input buffer is too small or oddly -/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently -/// supported SIMD instruction sets. -/// /// # Examples /// /// ``` @@ -942,6 +985,7 @@ fn parent_node_output( /// # } /// ``` #[derive(Clone)] +#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] pub struct Hasher { key: CVWords, chunk_state: ChunkState, @@ -1069,48 +1113,17 @@ impl Hasher { self.cv_stack.push(*new_cv); } - /// Add input bytes to the hash state. You can call this any number of - /// times. + /// Add input bytes to the hash state. You can call this any number of times. /// /// This method is always single-threaded. For multithreading support, see - /// [`update_rayon`](#method.update_rayon) below (enabled with the `rayon` - /// Cargo feature). + /// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature). /// - /// Note that the degree of SIMD parallelism that `update` can use is - /// limited by the size of this input buffer. The 8 KiB buffer currently - /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but - /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to - /// leverage all currently supported SIMD instruction sets. 
- /// - /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html + /// Note that the degree of SIMD parallelism that `update` can use is limited by the size of + /// this input buffer. See [`update_reader`](#method.update_reader). pub fn update(&mut self, input: &[u8]) -> &mut Self { self.update_with_join::(input) } - /// Identical to [`update`](Hasher::update), but using Rayon-based - /// multithreading internally. - /// - /// This method is gated by the `rayon` Cargo feature, which is disabled by - /// default but enabled on [docs.rs](https://docs.rs). - /// - /// To get any performance benefit from multithreading, the input buffer - /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is - /// _slower_ than `update` for inputs under 128 KiB. That threshold varies - /// quite a lot across different processors, and it's important to benchmark - /// your specific use case. - /// - /// Memory mapping an entire input file is a simple way to take advantage of - /// multithreading without needing to carefully tune your buffer size or - /// offload IO. However, on spinning disks where random access is expensive, - /// that approach can lead to disk thrashing and terrible IO performance. - /// Note that OS page caching can mask this problem, in which case it might - /// only appear for files larger than available RAM. Again, benchmarking - /// your specific use case is important. - #[cfg(feature = "rayon")] - pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { - self.update_with_join::(input) - } - fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { // If we have some partial chunk bytes in the internal chunk_state, we // need to finish that chunk first. 
@@ -1309,6 +1322,182 @@ impl Hasher { pub fn count(&self) -> u64 { self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64 } + + /// As [`update`](Hasher::update), but reading from a + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation. + /// + /// [`Hasher`] implements + /// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to + /// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`] + /// from any reader. Unfortunately, this standard approach can limit performance, because + /// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of + /// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512) + /// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly + /// more convenient. + /// + /// The internal buffer size this method uses may change at any time, and it may be different + /// for different targets. The only guarantee is that it will be large enough for all of this + /// crate's SIMD implementations on the current platform. + /// + /// The most common implementer of + /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be + /// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory + /// mapping can be faster than this method for hashing large files. See + /// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon), + /// which require the `mmap` and (for the latter) `rayon` Cargo features. + /// + /// This method requires the `std` Cargo feature, which is enabled by default. + /// + /// # Example + /// + /// ```no_run + /// # use std::fs::File; + /// # use std::io; + /// # fn main() -> io::Result<()> { + /// // Hash standard input. 
+ /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_reader(std::io::stdin().lock())?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "std")] + pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> { + io::copy_wide(reader, self)?; + Ok(self) + } + + /// As [`update`](Hasher::update), but using Rayon-based multithreading + /// internally. + /// + /// This method is gated by the `rayon` Cargo feature, which is disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// To get any performance benefit from multithreading, the input buffer + /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is + /// _slower_ than `update` for inputs under 128 KiB. That threshold varies + /// quite a lot across different processors, and it's important to benchmark + /// your specific use case. See also the performance warning associated with + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon). + /// + /// If you already have a large buffer in memory, and you want to hash it + /// with multiple threads, this method is a good option. However, reading a + /// file into memory just to call this method can be a performance mistake, + /// both because it requires lots of memory and because single-threaded + /// reads can be slow. For hashing whole files, see + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both + /// the `rayon` and `mmap` Cargo features. + #[cfg(feature = "rayon")] + pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { + self.update_with_join::(input) + } + + /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping. + /// + /// Not all files can be memory mapped, and memory mapping small files can be slower than + /// reading them the usual way. In those cases, this method will fall back to standard file IO. 
+ /// The heuristic for whether to use memory mapping is currently very simple (file size >= + /// 16 KiB), and it might change at any time. + /// + /// Like [`update`](Hasher::update), this method is single-threaded. In this author's + /// experience, memory mapping improves single-threaded performance by ~10% for large files + /// that are already in cache. This probably varies between platforms, and as always it's a + /// good idea to benchmark your own use case. In comparison, the multithreaded + /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on + /// performance. + /// + /// There's a correctness reason that this method takes + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of + /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped + /// file ignores the seek position of the original file handle (it neither respects the current + /// position nor updates the position). This difference in behavior would've caused + /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and + /// have different side effects in some cases. Taking a + /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by + /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is + /// opened internally. + /// + /// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on + /// [docs.rs](https://docs.rs). 
+ /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// let path = Path::new("file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap(path)?; + /// println!("{}", hasher.finalize()); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + pub fn update_mmap(&mut self, path: impl AsRef) -> std::io::Result<&mut Self> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(self) + } + + /// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using + /// memory mapping. This is the default behavior of `b3sum`. + /// + /// For large files that are likely to be in cache, this can be much faster than + /// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other + /// cryptographic hashes, this is usually what they're measuring. However... + /// + /// **Performance Warning:** There are cases where multithreading hurts performance. The worst + /// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31), + /// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends + /// more time seeking around than reading data). Windows tends to be somewhat worse about this, + /// in part because it's less likely than Linux to keep very large files in cache. More + /// generally, if your CPU cores are already busy, then multithreading will add overhead + /// without improving performance. If your code runs in different environments that you don't + /// control and can't measure, then unfortunately there's no one-size-fits-all answer for + /// whether multithreading is a good idea. 
+ /// + /// The memory mapping behavior of this function is the same as + /// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard + /// file IO might change at any time. + /// + /// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by + /// default but enabled on [docs.rs](https://docs.rs). + /// + /// # Example + /// + /// ```no_run + /// # use std::io; + /// # use std::path::Path; + /// # fn main() -> io::Result<()> { + /// # #[cfg(feature = "rayon")] + /// # { + /// let path = Path::new("big_file.dat"); + /// let mut hasher = blake3::Hasher::new(); + /// hasher.update_mmap_rayon(path)?; + /// println!("{}", hasher.finalize()); + /// # } + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "mmap")] + #[cfg(feature = "rayon")] + pub fn update_mmap_rayon( + &mut self, + path: impl AsRef, + ) -> std::io::Result<&mut Self> { + let file = std::fs::File::open(path.as_ref())?; + if let Some(mmap) = io::maybe_mmap_file(&file)? { + self.update_rayon(&mmap); + } else { + io::copy_wide(&file, self)?; + } + Ok(self) + } } // Don't derive(Debug), because the state may be secret. @@ -1366,6 +1555,7 @@ impl std::io::Write for Hasher { /// from an unknown position in the output stream to recover its block index. Callers with strong /// secret keys aren't affected in practice, but secret offsets are a [design /// smell](https://en.wikipedia.org/wiki/Design_smell) in any case. 
+#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] pub struct OutputReader { inner: Output, diff --git a/third-party/blake3/src/platform.rs b/third-party/blake3/src/platform.rs index 00058b16..79bc9a3f 100644 --- a/third-party/blake3/src/platform.rs +++ b/third-party/blake3/src/platform.rs @@ -56,6 +56,11 @@ pub enum Platform { impl Platform { #[allow(unreachable_code)] pub fn detect() -> Self { + #[cfg(miri)] + { + return Platform::Portable; + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[cfg(blake3_avx512_ffi)] @@ -327,7 +332,12 @@ impl Platform { #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn avx512_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_avx512") { return false; @@ -349,7 +359,12 @@ pub fn avx512_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn avx2_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_avx2") { return false; @@ -371,7 +386,12 @@ pub fn avx2_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] +#[allow(unreachable_code)] pub fn sse41_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. if cfg!(feature = "no_sse41") { return false; @@ -395,6 +415,10 @@ pub fn sse41_detected() -> bool { #[inline(always)] #[allow(unreachable_code)] pub fn sse2_detected() -> bool { + if cfg!(miri) { + return false; + } + // A testing-only short-circuit. 
if cfg!(feature = "no_sse2") { return false; diff --git a/third-party/blake3/src/test.rs b/third-party/blake3/src/test.rs index 60bbe8cc..c76cbbc0 100644 --- a/third-party/blake3/src/test.rs +++ b/third-party/blake3/src/test.rs @@ -628,3 +628,211 @@ const fn test_hash_const_conversions() { let hash = crate::Hash::from_bytes(bytes); _ = hash.as_bytes(); } + +#[cfg(feature = "zeroize")] +#[test] +fn test_zeroize() { + use zeroize::Zeroize; + + let mut hash = crate::Hash([42; 32]); + hash.zeroize(); + assert_eq!(hash.0, [0u8; 32]); + + let mut hasher = crate::Hasher { + chunk_state: crate::ChunkState { + cv: [42; 8], + chunk_counter: 42, + buf: [42; 64], + buf_len: 42, + blocks_compressed: 42, + flags: 42, + platform: crate::Platform::Portable, + }, + key: [42; 8], + cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), + }; + hasher.zeroize(); + assert_eq!(hasher.chunk_state.cv, [0; 8]); + assert_eq!(hasher.chunk_state.chunk_counter, 0); + assert_eq!(hasher.chunk_state.buf, [0; 64]); + assert_eq!(hasher.chunk_state.buf_len, 0); + assert_eq!(hasher.chunk_state.blocks_compressed, 0); + assert_eq!(hasher.chunk_state.flags, 0); + assert!(matches!( + hasher.chunk_state.platform, + crate::Platform::Portable + )); + assert_eq!(hasher.key, [0; 8]); + assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]); + + let mut output_reader = crate::OutputReader { + inner: crate::Output { + input_chaining_value: [42; 8], + block: [42; 64], + counter: 42, + block_len: 42, + flags: 42, + platform: crate::Platform::Portable, + }, + position_within_block: 42, + }; + + output_reader.zeroize(); + assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); + assert_eq!(output_reader.inner.block, [0; 64]); + assert_eq!(output_reader.inner.counter, 0); + assert_eq!(output_reader.inner.block_len, 0); + assert_eq!(output_reader.inner.flags, 0); + assert!(matches!( + output_reader.inner.platform, + crate::Platform::Portable + )); + assert_eq!(output_reader.position_within_block, 0); +} + +#[test] 
+#[cfg(feature = "std")] +fn test_update_reader() -> Result<(), std::io::Error> { + // This is a brief test, since update_reader() is mostly a wrapper around update(), which already + // has substantial testing. + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + assert_eq!( + crate::Hasher::new().update_reader(&input[..])?.finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +fn test_update_reader_interrupted() -> std::io::Result<()> { + use std::io; + struct InterruptingReader<'a> { + already_interrupted: bool, + slice: &'a [u8], + } + impl<'a> InterruptingReader<'a> { + fn new(slice: &'a [u8]) -> Self { + Self { + already_interrupted: false, + slice, + } + } + } + impl<'a> io::Read for InterruptingReader<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + if !self.already_interrupted { + self.already_interrupted = true; + return Err(io::Error::from(io::ErrorKind::Interrupted)); + } + let take = std::cmp::min(self.slice.len(), buf.len()); + buf[..take].copy_from_slice(&self.slice[..take]); + self.slice = &self.slice[take..]; + Ok(take) + } + } + + let input = b"hello world"; + let mut reader = InterruptingReader::new(input); + let mut hasher = crate::Hasher::new(); + hasher.update_reader(&mut reader)?; + assert_eq!(hasher.finalize(), crate::hash(input)); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +// NamedTempFile isn't Miri-compatible +#[cfg(not(miri))] +fn test_mmap() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already + // has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + assert_eq!( + crate::Hasher::new() + .update_mmap(tempfile.path())?
+ .finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(target_os = "linux")] +fn test_mmap_virtual_file() -> Result<(), std::io::Error> { + // Virtual files like /proc/version can't be mmapped, because their contents don't actually + // exist anywhere in memory. Make sure we fall back to regular file IO in these cases. + // Currently this is handled with a length check, where the assumption is that virtual files + // will always report length 0. If that assumption ever breaks, hopefully this test will catch + // it. + let virtual_filepath = "/proc/version"; + let mut mmap_hasher = crate::Hasher::new(); + // We'll fail right here if the fallback doesn't work. + mmap_hasher.update_mmap(virtual_filepath)?; + let mut read_hasher = crate::Hasher::new(); + read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?; + assert_eq!(mmap_hasher.finalize(), read_hasher.finalize()); + Ok(()) +} + +#[test] +#[cfg(feature = "mmap")] +#[cfg(feature = "rayon")] +// NamedTempFile isn't Miri-compatible +#[cfg(not(miri))] +fn test_mmap_rayon() -> Result<(), std::io::Error> { + // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(), + // which already has substantial testing. + use std::io::prelude::*; + let mut input = vec![0; 1_000_000]; + paint_test_input(&mut input); + let mut tempfile = tempfile::NamedTempFile::new()?; + tempfile.write_all(&input)?; + tempfile.flush()?; + assert_eq!( + crate::Hasher::new() + .update_mmap_rayon(tempfile.path())? 
+ .finalize(), + crate::hash(&input), + ); + Ok(()) +} + +#[test] +#[cfg(feature = "std")] +#[cfg(feature = "serde")] +fn test_serde() { + let hash: crate::Hash = [7; 32].into(); + let json = serde_json::to_string(&hash).unwrap(); + assert_eq!( + json, + "[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]", + ); + let hash2: crate::Hash = serde_json::from_str(&json).unwrap(); + assert_eq!(hash, hash2); +} + +// `cargo +nightly miri test` currently works, but it takes forever, because some of our test +// inputs are quite large. Most of our unsafe code is platform specific and incompatible with Miri +// anyway, but we'd like it to be possible for callers to run their own tests under Miri, assuming +// they don't use incompatible features like Rayon or mmap. This test should get reasonable +// coverage of our public API without using any large inputs, so we can run it in CI and catch +// obvious breaks. (For example, constant_time_eq is not compatible with Miri.) +#[test] +fn test_miri_smoketest() { + let mut hasher = crate::Hasher::new_derive_key("Miri smoketest"); + hasher.update(b"foo"); + #[cfg(feature = "std")] + hasher.update_reader(&b"bar"[..]).unwrap(); + assert_eq!(hasher.finalize(), hasher.finalize()); + let mut reader = hasher.finalize_xof(); + reader.set_position(999999); + reader.fill(&mut [0]); +} diff --git a/third-party/blake3/tools/release.md b/third-party/blake3/tools/release.md index 17a07b0f..924f3279 100644 --- a/third-party/blake3/tools/release.md +++ b/third-party/blake3/tools/release.md @@ -4,7 +4,7 @@ - Bump the version in the root Cargo.toml. - Bump the version in b3sum/Cargo.toml. - Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar. -- Update the `--help` output in b3sum/README.md if it's changed. +- Update the `-h` output in b3sum/README.md if it's changed. - Bump `BLAKE3_VERSION_STRING` in c/blake3.h. - Bump `VERSION` in c/CMakeLists.txt. - Make a version bump commit with change notes.