diff --git a/outside/scrypt/Makefile b/outside/scrypt/Makefile
index f3ac5acb9..6185ce200 100644
--- a/outside/scrypt/Makefile
+++ b/outside/scrypt/Makefile
@@ -1,13 +1,13 @@
 default: all
 
 CC?=gcc
-CFLAGS?=-O2 -msse3 -ffast-math \
+CFLAGS?=-O2 -ffast-math \
 	-Wall -g -D_FORTIFY_SOURCE=2 -fPIC
 CFLAGS_EXTRA?=-Wl,-rpath=.
 
 all: scrypt.a
 
-OBJS= crypto_scrypt-sse.o sha256.o
+OBJS= crypto_scrypt-nosse.o sha256.o
 
 scrypt.a: $(OBJS)
 	ar rcs scrypt.a $(OBJS)
diff --git a/outside/scrypt/crypto_scrypt-sse.c b/outside/scrypt/crypto_scrypt-nosse.c
similarity index 69%
rename from outside/scrypt/crypto_scrypt-sse.c
rename to outside/scrypt/crypto_scrypt-nosse.c
index d7b33c078..d62e3ab4f 100644
--- a/outside/scrypt/crypto_scrypt-sse.c
+++ b/outside/scrypt/crypto_scrypt-nosse.c
@@ -29,7 +29,6 @@
 #include <sys/types.h>
 #include <sys/mman.h>
 
-#include <emmintrin.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@@ -43,9 +42,9 @@
 static void
 blkcpy(void * dest, void * src, size_t len)
 {
-	__m128i * D = dest;
-	__m128i * S = src;
-	size_t L = len / 16;
+	size_t * D = dest;
+	size_t * S = src;
+	size_t L = len / sizeof(size_t);
 	size_t i;
 
 	for (i = 0; i < L; i++)
@@ -55,13 +54,13 @@ blkcpy(void * dest, void * src, size_t len)
 static void
 blkxor(void * dest, void * src, size_t len)
 {
-	__m128i * D = dest;
-	__m128i * S = src;
-	size_t L = len / 16;
+	size_t * D = dest;
+	size_t * S = src;
+	size_t L = len / sizeof(size_t);
 	size_t i;
 
 	for (i = 0; i < L; i++)
-		D[i] = _mm_xor_si128(D[i], S[i]);
+		D[i] ^= S[i];
 }
 
 /**
@@ -69,61 +68,43 @@ blkxor(void * dest, void * src, size_t len)
  * Apply the salsa20/8 core to the provided block.
  */
 static void
-salsa20_8(__m128i B[4])
+salsa20_8(uint32_t B[16])
 {
-	__m128i X0, X1, X2, X3;
-	__m128i T;
+	uint32_t x[16];
 	size_t i;
 
-	X0 = B[0];
-	X1 = B[1];
-	X2 = B[2];
-	X3 = B[3];
-
+	blkcpy(x, B, 64);
 	for (i = 0; i < 8; i += 2) {
-		/* Operate on "columns". */
-		T = _mm_add_epi32(X0, X3);
-		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
-		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
-		T = _mm_add_epi32(X1, X0);
-		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
-		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
-		T = _mm_add_epi32(X2, X1);
-		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
-		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
-		T = _mm_add_epi32(X3, X2);
-		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
-		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
+#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+		/* Operate on columns. */
+		x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
+		x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
 
-		/* Rearrange data. */
-		X1 = _mm_shuffle_epi32(X1, 0x93);
-		X2 = _mm_shuffle_epi32(X2, 0x4E);
-		X3 = _mm_shuffle_epi32(X3, 0x39);
+		x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
+		x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
 
-		/* Operate on "rows". */
-		T = _mm_add_epi32(X0, X1);
-		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
-		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
-		T = _mm_add_epi32(X3, X0);
-		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
-		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
-		T = _mm_add_epi32(X2, X3);
-		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
-		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
-		T = _mm_add_epi32(X1, X2);
-		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
-		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
+		x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
+		x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
 
-		/* Rearrange data. */
-		X1 = _mm_shuffle_epi32(X1, 0x39);
-		X2 = _mm_shuffle_epi32(X2, 0x4E);
-		X3 = _mm_shuffle_epi32(X3, 0x93);
+		x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
+		x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
+
+		/* Operate on rows. */
+		x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
+		x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
+
+		x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
+		x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
+
+		x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
+		x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
+
+		x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
+		x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
+#undef R
 	}
-
-	B[0] = _mm_add_epi32(B[0], X0);
-	B[1] = _mm_add_epi32(B[1], X1);
-	B[2] = _mm_add_epi32(B[2], X2);
-	B[3] = _mm_add_epi32(B[3], X3);
+	for (i = 0; i < 16; i++)
+		B[i] += x[i];
 }
 
 /**
@@ -133,30 +114,30 @@ salsa20_8(__m128i B[4])
  * temporary space X must be 64 bytes.
  */
 static void
-blockmix_salsa8(__m128i * Bin, __m128i * Bout, __m128i * X, size_t r)
+blockmix_salsa8(uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r)
 {
 	size_t i;
 
 	/* 1: X <-- B_{2r - 1} */
-	blkcpy(X, &Bin[8 * r - 4], 64);
+	blkcpy(X, &Bin[(2 * r - 1) * 16], 64);
 
 	/* 2: for i = 0 to 2r - 1 do */
-	for (i = 0; i < r; i++) {
+	for (i = 0; i < 2 * r; i += 2) {
 		/* 3: X <-- H(X \xor B_i) */
-		blkxor(X, &Bin[i * 8], 64);
+		blkxor(X, &Bin[i * 16], 64);
 		salsa20_8(X);
 
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		blkcpy(&Bout[i * 4], X, 64);
+		blkcpy(&Bout[i * 8], X, 64);
 
 		/* 3: X <-- H(X \xor B_i) */
-		blkxor(X, &Bin[i * 8 + 4], 64);
+		blkxor(X, &Bin[i * 16 + 16], 64);
 		salsa20_8(X);
 
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		blkcpy(&Bout[(r + i) * 4], X, 64);
+		blkcpy(&Bout[i * 8 + r * 16], X, 64);
 	}
 }
 
@@ -169,7 +150,7 @@ integerify(void * B, size_t r)
 {
 	uint32_t * X = (void *)((uintptr_t)(B) + (2 * r - 1) * 64);
 
-	return (((uint64_t)(X[13]) << 32) + X[0]);
+	return (((uint64_t)(X[1]) << 32) + X[0]);
 }
 
 /**
@@ -181,34 +162,29 @@ integerify(void * B, size_t r)
  * multiple of 64 bytes.
  */
 void
-smix(uint8_t * B, size_t r, uint64_t N, void * V, void * XY)
+smix(uint8_t * B, size_t r, uint64_t N, uint32_t * V, uint32_t * XY)
 {
-	__m128i * X = XY;
-	__m128i * Y = (void *)((uintptr_t)(XY) + 128 * r);
-	__m128i * Z = (void *)((uintptr_t)(XY) + 256 * r);
-	uint32_t * X32 = (void *)X;
-	uint64_t i, j;
+	uint32_t * X = XY;
+	uint32_t * Y = &XY[32 * r];
+	uint32_t * Z = &XY[64 * r];
+	uint64_t i;
+	uint64_t j;
 	size_t k;
 
 	/* 1: X <-- B */
-	for (k = 0; k < 2 * r; k++) {
-		for (i = 0; i < 16; i++) {
-			X32[k * 16 + i] =
-			    le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]);
-		}
-	}
+	for (k = 0; k < 32 * r; k++)
+		X[k] = le32dec(&B[4 * k]);
 
 	/* 2: for i = 0 to N - 1 do */
 	for (i = 0; i < N; i += 2) {
 		/* 3: V_i <-- X */
-		blkcpy((void *)((uintptr_t)(V) + i * 128 * r), X, 128 * r);
+		blkcpy(&V[i * (32 * r)], X, 128 * r);
 
 		/* 4: X <-- H(X) */
 		blockmix_salsa8(X, Y, Z, r);
 
 		/* 3: V_i <-- X */
-		blkcpy((void *)((uintptr_t)(V) + (i + 1) * 128 * r),
-		    Y, 128 * r);
+		blkcpy(&V[(i + 1) * (32 * r)], Y, 128 * r);
 
 		/* 4: X <-- H(X) */
 		blockmix_salsa8(Y, X, Z, r);
@@ -220,24 +196,20 @@ smix(uint8_t * B, size_t r, uint64_t N, void * V, void * XY)
 		j = integerify(X, r) & (N - 1);
 
 		/* 8: X <-- H(X \xor V_j) */
-		blkxor(X, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
+		blkxor(X, &V[j * (32 * r)], 128 * r);
 		blockmix_salsa8(X, Y, Z, r);
 
 		/* 7: j <-- Integerify(X) mod N */
 		j = integerify(Y, r) & (N - 1);
 
 		/* 8: X <-- H(X \xor V_j) */
-		blkxor(Y, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
+		blkxor(Y, &V[j * (32 * r)], 128 * r);
 		blockmix_salsa8(Y, X, Z, r);
 	}
 
 	/* 10: B' <-- X */
-	for (k = 0; k < 2 * r; k++) {
-		for (i = 0; i < 16; i++) {
-			le32enc(&B[(k * 16 + (i * 5 % 16)) * 4],
-			    X32[k * 16 + i]);
-		}
-	}
+	for (k = 0; k < 32 * r; k++)
+		le32enc(&B[4 * k], X[k]);
 }
 
 /**
@@ -277,7 +249,7 @@ crypto_scrypt(const uint8_t * passwd, size_t passwdlen,
 	}
 	if ((r > SIZE_MAX / 128 / p) ||
 #if SIZE_MAX / 256 <= UINT32_MAX
-	    (r > (SIZE_MAX - 64) / 256) ||
+	    (r > SIZE_MAX / 256) ||
 #endif
 	    (N > SIZE_MAX / 128 / r)) {
 		errno = ENOMEM;
diff --git a/outside/scrypt/crypto_scrypt.h b/outside/scrypt/crypto_scrypt.h
index b88be09d8..e2638c5f3 100644
--- a/outside/scrypt/crypto_scrypt.h
+++ b/outside/scrypt/crypto_scrypt.h
@@ -32,7 +32,7 @@
 #include <stdint.h>
 #include <stddef.h>
 
-void smix(uint8_t *, size_t, uint64_t, void *, void *);
+void smix(uint8_t *, size_t, uint64_t, uint32_t *, uint32_t *);
 void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
     uint64_t, uint8_t *, size_t);
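
A note on the Integerify change: the SSE build loads each 64-byte block into X with its 32-bit words permuted by i -> (i * 5) % 16 (see the le32dec loop removed from smix), so the word the scalar code reads as X[1] sits at shuffled index 13. That is why the return expression moves from X[13] to X[1]; both builds parse the same little-endian integer. A standalone check of the index mapping (illustrative only, not part of the patch):

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	int i, perm[16];

	/* Shuffled slot i holds linear word (i * 5) % 16, matching the
	 * removed le32dec(&B[(k * 16 + (i * 5 % 16)) * 4]) load. */
	for (i = 0; i < 16; i++)
		perm[i] = (i * 5) % 16;
	assert(perm[13] == 1);	/* SSE X[13] is the scalar X[1] */
	assert(perm[0] == 0);	/* word 0 sits at index 0 in both layouts */
	printf("Integerify index mapping OK\n");
	return (0);
}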
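
Since crypto_scrypt.h exports smix directly, callers must still honor the contract restated in its comment: B is 128*r bytes, V is 128*r*N bytes, XY is 256*r + 64 bytes, and N is a power of 2 greater than 1. A minimal sketch of a direct call; the driver below is hypothetical and not part of the patch, with posix_memalign standing in for whatever 64-byte-aligned allocator the caller uses:

#define _POSIX_C_SOURCE 200112L

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "crypto_scrypt.h"

int
main(void)
{
	size_t r = 8;		/* block size parameter */
	uint64_t N = 16;	/* cost parameter: power of 2, > 1 */
	uint8_t *B;
	uint32_t *V, *XY;

	/* B: 128r bytes; V: 128rN bytes; XY: 256r + 64 bytes. */
	if (posix_memalign((void **)&B, 64, 128 * r) != 0 ||
	    posix_memalign((void **)&V, 64, 128 * r * N) != 0 ||
	    posix_memalign((void **)&XY, 64, 256 * r + 64) != 0)
		return (1);
	memset(B, 0, 128 * r);
	smix(B, r, N, V, XY);	/* B now holds SMix_r(B, N) */
	free(B);
	free(V);
	free(XY);
	return (0);
}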
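
Finally, the scalar path can be smoke-tested against the first test vector from the scrypt paper (also RFC 7914, section 12): scrypt(P = "", S = "", N = 16, r = 1, p = 1, dkLen = 64). The sketch below assumes crypto_scrypt() is declared in crypto_scrypt.h with the usual (passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen) signature and is linked against the scrypt.a produced by the Makefile above; only the leading eight output bytes are checked here:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "crypto_scrypt.h"

int
main(void)
{
	/* First 8 bytes of the published 64-byte derived key. */
	static const uint8_t expect[8] =
	    { 0x77, 0xd6, 0x57, 0x62, 0x38, 0x65, 0x7b, 0x20 };
	uint8_t dk[64];

	if (crypto_scrypt((const uint8_t *)"", 0, (const uint8_t *)"", 0,
	    16, 1, 1, dk, sizeof(dk)) != 0) {
		perror("crypto_scrypt");
		return (1);
	}
	if (memcmp(dk, expect, sizeof(expect)) != 0) {
		fprintf(stderr, "test vector mismatch\n");
		return (1);
	}
	printf("scrypt test vector OK\n");
	return (0);
}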