From def379ce3f216613b0ca60f0f849ef129f8678a3 Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Mon, 13 May 2024 15:22:05 +0200 Subject: [PATCH] LibCrypto: Move some data around earlier in GHash to make it go faster This makes galois_multiply() about 10% faster. --- Tests/LibCrypto/TestAES.cpp | 11 +++++++++++ .../Libraries/LibCrypto/Authentication/GHash.cpp | 12 ++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/Tests/LibCrypto/TestAES.cpp b/Tests/LibCrypto/TestAES.cpp index c20ab686ed..7e1dab571b 100644 --- a/Tests/LibCrypto/TestAES.cpp +++ b/Tests/LibCrypto/TestAES.cpp @@ -336,6 +336,17 @@ TEST_CASE(test_AES_CTR_128bit_decrypt_16bytes) // If encryption works, then decryption works, too. } +BENCHMARK_CASE(GCM) +{ + Crypto::Authentication::GHash ghash("WellHelloFriends"_b); + auto v = ByteBuffer::create_uninitialized(16 * MiB).release_value(); + fill_with_random(v); + for (size_t i = 0; i < 10; ++i) { + ghash.process(v, "test"_b); + AK::taint_for_optimizer(v); + } +} + TEST_CASE(test_AES_GCM_name) { Crypto::Cipher::AESCipher::GCMMode cipher("WellHelloFriends"_b, 128, Crypto::Cipher::Intent::Encryption); diff --git a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp index 1ca2013f41..a40a6026d9 100644 --- a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp +++ b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp @@ -86,13 +86,15 @@ GHash::TagType GHash::process(ReadonlyBytes aad, ReadonlyBytes cipher) /// Galois Field multiplication using <x^127 + x^7 + x^2 + x + 1>. /// Note that x, y, and z are strictly BE. -void galois_multiply(u32 (&z)[4], u32 const (&_x)[4], u32 const (&_y)[4]) +void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4]) { + // Note: Copied upfront to stack to avoid memory access in the loop. 
u32 x[4] { _x[0], _x[1], _x[2], _x[3] }; - u32 y[4] { _y[0], _y[1], _y[2], _y[3] }; - __builtin_memset(z, 0, sizeof(z)); + u32 const y[4] { _y[0], _y[1], _y[2], _y[3] }; + u32 z[4] { 0, 0, 0, 0 }; -#pragma GCC unroll 16 + // Unrolled by 32, the access in y[3-(i/32)] can be cached throughout the loop. +#pragma GCC unroll 32 for (ssize_t i = 127; i > -1; --i) { auto r = -((y[3 - (i / 32)] >> (i % 32)) & 1); z[0] ^= x[0] & r; @@ -113,6 +115,8 @@ void galois_multiply(u32 (&z)[4], u32 const (&_x)[4], u32 const (&_y)[4]) x[0] ^= 0xe1000000 & -a3; } + + memcpy(_z, z, sizeof(z)); } }