Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Commit: Initial upload v3.4.7
algo/argon2/ar2/sj/scrypt-jane-hash.h (new file, 38 lines)
@@ -0,0 +1,38 @@
#if defined(SCRYPT_SKEIN512)
#include "scrypt-jane-hash_skein512.h"
#else
    #define SCRYPT_HASH "ERROR"
    #define SCRYPT_HASH_BLOCK_SIZE 64
    #define SCRYPT_HASH_DIGEST_SIZE 64
    typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state;
    typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
    static void scrypt_hash_init(scrypt_hash_state *S) {}
    static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {}
    static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {}
    static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0};
    #error must define a hash function!
#endif

#include "scrypt-jane-pbkdf2.h"

#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */

static int
scrypt_test_hash(void) {
    scrypt_hash_state st;
    scrypt_hash_digest hash, final;
    uint8_t msg[SCRYPT_TEST_HASH_LEN];
    size_t i;

    for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++)
        msg[i] = (uint8_t)i;

    scrypt_hash_init(&st);
    for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) {
        scrypt_hash(hash, msg, i);
        scrypt_hash_update(&st, hash, sizeof(hash));
    }
    scrypt_hash_finish(&st, final);
    return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
}
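The self-test above hashes every message length from 0 through SCRYPT_TEST_HASH_LEN and chains the digests through one incremental state, so a single final comparison exercises every block/leftover boundary of the underlying hash. A minimal sketch of the same incremental-vs-one-shot check a caller could run directly (assuming the `scrypt_hash` one-shot wrapper and `scrypt_verify` from the surrounding scrypt-jane sources are in scope):

/* Sketch: incremental hashing must agree with the one-shot path for any
   split point. Assumes scrypt_hash(), scrypt_hash_init/update/finish()
   and scrypt_verify() as declared in these headers. */
static int
scrypt_hash_split_check(const uint8_t *msg, size_t len, size_t split) {
    scrypt_hash_state st;
    scrypt_hash_digest one_shot, incremental;

    scrypt_hash(one_shot, msg, len);                    /* whole message */

    scrypt_hash_init(&st);
    scrypt_hash_update(&st, msg, split);                /* first piece  */
    scrypt_hash_update(&st, msg + split, len - split);  /* remainder    */
    scrypt_hash_finish(&st, incremental);

    return scrypt_verify(one_shot, incremental, SCRYPT_HASH_DIGEST_SIZE);
}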
algo/argon2/ar2/sj/scrypt-jane-hash_skein512.h (new file, 188 lines)
@@ -0,0 +1,188 @@
#define SCRYPT_HASH "Skein-512"
#define SCRYPT_HASH_BLOCK_SIZE 64
#define SCRYPT_HASH_DIGEST_SIZE 64

typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];

typedef struct scrypt_hash_state_t {
    uint64_t X[8], T[2];
    uint32_t leftover;
    uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
} scrypt_hash_state;

#include <stdio.h>

static void
skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
    uint64_t X[8], key[8], Xt[9+18], T[3+1];
    size_t r;

    while (blocks--) {
        T[0] = S->T[0] + add;
        T[1] = S->T[1];
        T[2] = T[0] ^ T[1];
        key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
        key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
        key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
        key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
        key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
        key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
        key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
        key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
        Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
        in += SCRYPT_HASH_BLOCK_SIZE;

        for (r = 0; r < 18; r++)
            Xt[r + 9] = Xt[r + 0];

        for (r = 0; r < 18; r += 2) {
            X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
            X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
            X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
            X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
            X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
            X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
            X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
            X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
            X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
            X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
            X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
            X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
            X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
            X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
            X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
            X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0];

            X[0] += Xt[r + 1];
            X[1] += Xt[r + 2];
            X[2] += Xt[r + 3];
            X[3] += Xt[r + 4];
            X[4] += Xt[r + 5];
            X[5] += Xt[r + 6] + T[1];
            X[6] += Xt[r + 7] + T[2];
            X[7] += Xt[r + 8] + r + 1;

            T[3] = T[0];
            T[0] = T[1];
            T[1] = T[2];
            T[2] = T[3];

            X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
            X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
            X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
            X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
            X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
            X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
            X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
            X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
            X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
            X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
            X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
            X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
            X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6];
            X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
            X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
            X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];

            X[0] += Xt[r + 2];
            X[1] += Xt[r + 3];
            X[2] += Xt[r + 4];
            X[3] += Xt[r + 5];
            X[4] += Xt[r + 6];
            X[5] += Xt[r + 7] + T[1];
            X[6] += Xt[r + 8] + T[2];
            X[7] += Xt[r + 9] + r + 2;

            T[3] = T[0];
            T[0] = T[1];
            T[1] = T[2];
            T[2] = T[3];
        }

        S->X[0] = key[0] ^ X[0];
        S->X[1] = key[1] ^ X[1];
        S->X[2] = key[2] ^ X[2];
        S->X[3] = key[3] ^ X[3];
        S->X[4] = key[4] ^ X[4];
        S->X[5] = key[5] ^ X[5];
        S->X[6] = key[6] ^ X[6];
        S->X[7] = key[7] ^ X[7];

        S->T[0] = T[0];
        S->T[1] = T[1] & ~0x4000000000000000ull;
    }
}

static void
scrypt_hash_init(scrypt_hash_state *S) {
    S->X[0] = 0x4903ADFF749C51CEull;
    S->X[1] = 0x0D95DE399746DF03ull;
    S->X[2] = 0x8FD1934127C79BCEull;
    S->X[3] = 0x9A255629FF352CB1ull;
    S->X[4] = 0x5DB62599DF6CA7B0ull;
    S->X[5] = 0xEABE394CA9D5C3F4ull;
    S->X[6] = 0x991112C71A75B523ull;
    S->X[7] = 0xAE18A40B660FCC33ull;
    S->T[0] = 0x0000000000000000ull;
    S->T[1] = 0x7000000000000000ull;
    S->leftover = 0;
}

static void
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
    size_t blocks, want;

    /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
    if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
        /* handle the previous data, we know there is enough for at least one block */
        if (S->leftover) {
            want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
            memcpy(S->buffer + S->leftover, in, want);
            in += want;
            inlen -= want;
            S->leftover = 0;
            skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
        }

        /* handle the current data if there's more than one block */
        if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
            blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
            skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
            inlen -= blocks;
            in += blocks;
        }
    }

    /* handle leftover data */
    memcpy(S->buffer + S->leftover, in, inlen);
    S->leftover += (int) inlen;
}

static void
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
    memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
    S->T[1] |= 0x8000000000000000ull;
    skein512_blocks(S, S->buffer, 1, S->leftover);

    memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
    S->T[0] = 0;
    S->T[1] = 0xff00000000000000ull;
    skein512_blocks(S, S->buffer, 1, 8);

    U64TO8_LE(&hash[ 0], S->X[0]);
    U64TO8_LE(&hash[ 8], S->X[1]);
    U64TO8_LE(&hash[16], S->X[2]);
    U64TO8_LE(&hash[24], S->X[3]);
    U64TO8_LE(&hash[32], S->X[4]);
    U64TO8_LE(&hash[40], S->X[5]);
    U64TO8_LE(&hash[48], S->X[6]);
    U64TO8_LE(&hash[56], S->X[7]);
}


static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
    0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
    0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
    0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
    0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
};
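A note on the tweak words above: `T[0]` counts the bytes fed to UBI, while `T[1]` carries the block-type flags — it starts at `0x7000000000000000` (first block of the message type) in `scrypt_hash_init`, gets `0x8000000000000000` OR-ed in for the final block, and is set to `0xff00000000000000` for the closing output transform. Driving the state is the usual init/update/finish sequence; a minimal usage sketch against the API defined above:

/* Sketch: computing a Skein-512 digest with the state machine above. */
static void
skein512_hash_example(const uint8_t *data, size_t len,
                      uint8_t digest[SCRYPT_HASH_DIGEST_SIZE]) {
    scrypt_hash_state st;
    scrypt_hash_init(&st);              /* T[1] = first-block flag      */
    scrypt_hash_update(&st, data, len); /* buffers the trailing <=64 B  */
    scrypt_hash_finish(&st, digest);    /* final flag + output block    */
}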
algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx.h (new file, 367 lines)
@@ -0,0 +1,367 @@
/* x64 */
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)

#define SCRYPT_SALSA64_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
    a1(push rbp)
    a2(mov rbp, rsp)
    a2(and rsp, ~63)
    a2(sub rsp, 128)
    a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
    a2(shl rcx,7)
    a2(lea r9,[rcx-128])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(vmovdqa xmm0,[rax+0])
    a2(vmovdqa xmm1,[rax+16])
    a2(vmovdqa xmm2,[rax+32])
    a2(vmovdqa xmm3,[rax+48])
    a2(vmovdqa xmm4,[rax+64])
    a2(vmovdqa xmm5,[rax+80])
    a2(vmovdqa xmm6,[rax+96])
    a2(vmovdqa xmm7,[rax+112])
    aj(jz scrypt_ChunkMix_avx_no_xor1)
    a3(vpxor xmm0,xmm0,[r9+0])
    a3(vpxor xmm1,xmm1,[r9+16])
    a3(vpxor xmm2,xmm2,[r9+32])
    a3(vpxor xmm3,xmm3,[r9+48])
    a3(vpxor xmm4,xmm4,[r9+64])
    a3(vpxor xmm5,xmm5,[r9+80])
    a3(vpxor xmm6,xmm6,[r9+96])
    a3(vpxor xmm7,xmm7,[r9+112])
    a1(scrypt_ChunkMix_avx_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_avx_loop:)
    a2(and rdx, rdx)
    a3(vpxor xmm0,xmm0,[rsi+r9+0])
    a3(vpxor xmm1,xmm1,[rsi+r9+16])
    a3(vpxor xmm2,xmm2,[rsi+r9+32])
    a3(vpxor xmm3,xmm3,[rsi+r9+48])
    a3(vpxor xmm4,xmm4,[rsi+r9+64])
    a3(vpxor xmm5,xmm5,[rsi+r9+80])
    a3(vpxor xmm6,xmm6,[rsi+r9+96])
    a3(vpxor xmm7,xmm7,[rsi+r9+112])
    aj(jz scrypt_ChunkMix_avx_no_xor2)
    a3(vpxor xmm0,xmm0,[rdx+r9+0])
    a3(vpxor xmm1,xmm1,[rdx+r9+16])
    a3(vpxor xmm2,xmm2,[rdx+r9+32])
    a3(vpxor xmm3,xmm3,[rdx+r9+48])
    a3(vpxor xmm4,xmm4,[rdx+r9+64])
    a3(vpxor xmm5,xmm5,[rdx+r9+80])
    a3(vpxor xmm6,xmm6,[rdx+r9+96])
    a3(vpxor xmm7,xmm7,[rdx+r9+112])
    a1(scrypt_ChunkMix_avx_no_xor2:)
    a2(vmovdqa [rsp+0],xmm0)
    a2(vmovdqa [rsp+16],xmm1)
    a2(vmovdqa [rsp+32],xmm2)
    a2(vmovdqa [rsp+48],xmm3)
    a2(vmovdqa [rsp+64],xmm4)
    a2(vmovdqa [rsp+80],xmm5)
    a2(vmovdqa [rsp+96],xmm6)
    a2(vmovdqa [rsp+112],xmm7)
    a2(mov rax,8)
    a1(scrypt_salsa64_avx_loop: )
    a3(vpaddq xmm8, xmm0, xmm2)
    a3(vpaddq xmm9, xmm1, xmm3)
    a3(vpshufd xmm8, xmm8, 0xb1)
    a3(vpshufd xmm9, xmm9, 0xb1)
    a3(vpxor xmm6, xmm6, xmm8)
    a3(vpxor xmm7, xmm7, xmm9)
    a3(vpaddq xmm10, xmm0, xmm6)
    a3(vpaddq xmm11, xmm1, xmm7)
    a3(vpsrlq xmm8, xmm10, 51)
    a3(vpsrlq xmm9, xmm11, 51)
    a3(vpsllq xmm10, xmm10, 13)
    a3(vpsllq xmm11, xmm11, 13)
    a3(vpxor xmm4, xmm4, xmm8)
    a3(vpxor xmm5, xmm5, xmm9)
    a3(vpxor xmm4, xmm4, xmm10)
    a3(vpxor xmm5, xmm5, xmm11)
    a3(vpaddq xmm8, xmm6, xmm4)
    a3(vpaddq xmm9, xmm7, xmm5)
    a3(vpsrlq xmm10, xmm8, 25)
    a3(vpsrlq xmm11, xmm9, 25)
    a3(vpsllq xmm8, xmm8, 39)
    a3(vpsllq xmm9, xmm9, 39)
    a3(vpxor xmm2, xmm2, xmm10)
    a3(vpxor xmm3, xmm3, xmm11)
    a3(vpxor xmm2, xmm2, xmm8)
    a3(vpxor xmm3, xmm3, xmm9)
    a3(vpaddq xmm10, xmm4, xmm2)
    a3(vpaddq xmm11, xmm5, xmm3)
    a3(vpshufd xmm10, xmm10, 0xb1)
    a3(vpshufd xmm11, xmm11, 0xb1)
    a3(vpxor xmm0, xmm0, xmm10)
    a3(vpxor xmm1, xmm1, xmm11)
    a2(vmovdqa xmm8, xmm2)
    a2(vmovdqa xmm9, xmm3)
    a4(vpalignr xmm2, xmm6, xmm7, 8)
    a4(vpalignr xmm3, xmm7, xmm6, 8)
    a4(vpalignr xmm6, xmm9, xmm8, 8)
    a4(vpalignr xmm7, xmm8, xmm9, 8)
    a3(vpaddq xmm10, xmm0, xmm2)
    a3(vpaddq xmm11, xmm1, xmm3)
    a3(vpshufd xmm10, xmm10, 0xb1)
    a3(vpshufd xmm11, xmm11, 0xb1)
    a3(vpxor xmm6, xmm6, xmm10)
    a3(vpxor xmm7, xmm7, xmm11)
    a3(vpaddq xmm8, xmm0, xmm6)
    a3(vpaddq xmm9, xmm1, xmm7)
    a3(vpsrlq xmm10, xmm8, 51)
    a3(vpsrlq xmm11, xmm9, 51)
    a3(vpsllq xmm8, xmm8, 13)
    a3(vpsllq xmm9, xmm9, 13)
    a3(vpxor xmm5, xmm5, xmm10)
    a3(vpxor xmm4, xmm4, xmm11)
    a3(vpxor xmm5, xmm5, xmm8)
    a3(vpxor xmm4, xmm4, xmm9)
    a3(vpaddq xmm10, xmm6, xmm5)
    a3(vpaddq xmm11, xmm7, xmm4)
    a3(vpsrlq xmm8, xmm10, 25)
    a3(vpsrlq xmm9, xmm11, 25)
    a3(vpsllq xmm10, xmm10, 39)
    a3(vpsllq xmm11, xmm11, 39)
    a3(vpxor xmm2, xmm2, xmm8)
    a3(vpxor xmm3, xmm3, xmm9)
    a3(vpxor xmm2, xmm2, xmm10)
    a3(vpxor xmm3, xmm3, xmm11)
    a3(vpaddq xmm8, xmm5, xmm2)
    a3(vpaddq xmm9, xmm4, xmm3)
    a3(vpshufd xmm8, xmm8, 0xb1)
    a3(vpshufd xmm9, xmm9, 0xb1)
    a3(vpxor xmm0, xmm0, xmm8)
    a3(vpxor xmm1, xmm1, xmm9)
    a2(vmovdqa xmm10, xmm2)
    a2(vmovdqa xmm11, xmm3)
    a4(vpalignr xmm2, xmm6, xmm7, 8)
    a4(vpalignr xmm3, xmm7, xmm6, 8)
    a4(vpalignr xmm6, xmm11, xmm10, 8)
    a4(vpalignr xmm7, xmm10, xmm11, 8)
    a2(sub rax, 2)
    aj(ja scrypt_salsa64_avx_loop)
    a3(vpaddq xmm0,xmm0,[rsp+0])
    a3(vpaddq xmm1,xmm1,[rsp+16])
    a3(vpaddq xmm2,xmm2,[rsp+32])
    a3(vpaddq xmm3,xmm3,[rsp+48])
    a3(vpaddq xmm4,xmm4,[rsp+64])
    a3(vpaddq xmm5,xmm5,[rsp+80])
    a3(vpaddq xmm6,xmm6,[rsp+96])
    a3(vpaddq xmm7,xmm7,[rsp+112])
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0xff)
    a2(add r9,128)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(vmovdqa [rax+0],xmm0)
    a2(vmovdqa [rax+16],xmm1)
    a2(vmovdqa [rax+32],xmm2)
    a2(vmovdqa [rax+48],xmm3)
    a2(vmovdqa [rax+64],xmm4)
    a2(vmovdqa [rax+80],xmm5)
    a2(vmovdqa [rax+96],xmm6)
    a2(vmovdqa [rax+112],xmm7)
    aj(jne scrypt_ChunkMix_avx_loop)
    a2(mov rsp, rbp)
    a1(pop rbp)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_AVX

static void asm_calling_convention
scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];
    x4 = xmmp[4];
    x5 = xmmp[5];
    x6 = xmmp[6];
    x7 = xmmp[7];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
            x4 = _mm_xor_si128(x4, xmmp[4]);
            x5 = _mm_xor_si128(x5, xmmp[5]);
            x6 = _mm_xor_si128(x6, xmmp[6]);
            x7 = _mm_xor_si128(x7, xmmp[7]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;
        t4 = x4;
        t5 = x5;
        t6 = x6;
        t7 = x7;

        for (rounds = 8; rounds; rounds -= 2) {
            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x4 = _mm_xor_si128(x4, z2);
            x5 = _mm_xor_si128(x5, z3);
            x4 = _mm_xor_si128(x4, z0);
            x5 = _mm_xor_si128(x5, z1);

            z0 = _mm_add_epi64(x4, x6);
            z1 = _mm_add_epi64(x5, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x4);
            z1 = _mm_add_epi64(x3, x5);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x2;
            z1 = x3;
            x2 = _mm_alignr_epi8(x6, x7, 8);
            x3 = _mm_alignr_epi8(x7, x6, 8);
            x6 = _mm_alignr_epi8(z1, z0, 8);
            x7 = _mm_alignr_epi8(z0, z1, 8);

            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x5 = _mm_xor_si128(x5, z2);
            x4 = _mm_xor_si128(x4, z3);
            x5 = _mm_xor_si128(x5, z0);
            x4 = _mm_xor_si128(x4, z1);

            z0 = _mm_add_epi64(x5, x6);
            z1 = _mm_add_epi64(x4, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x5);
            z1 = _mm_add_epi64(x3, x4);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x2;
            z1 = x3;
            x2 = _mm_alignr_epi8(x6, x7, 8);
            x3 = _mm_alignr_epi8(x7, x6, 8);
            x6 = _mm_alignr_epi8(z1, z0, 8);
            x7 = _mm_alignr_epi8(z0, z1, 8);
        }

        x0 = _mm_add_epi64(x0, t0);
        x1 = _mm_add_epi64(x1, t1);
        x2 = _mm_add_epi64(x2, t2);
        x3 = _mm_add_epi64(x3, t3);
        x4 = _mm_add_epi64(x4, t4);
        x5 = _mm_add_epi64(x5, t5);
        x6 = _mm_add_epi64(x6, t6);
        x7 = _mm_add_epi64(x7, t7);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
        xmmp[4] = x4;
        xmmp[5] = x5;
        xmmp[6] = x6;
        xmmp[7] = x7;
    }
}

#endif

#if defined(SCRYPT_SALSA64_AVX)
/* uses salsa64_core_tangle_sse2 */

#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8-AVX"
#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#endif
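AVX has no 64-bit rotate instruction, so each `ROTL64(v, k)` from the scalar Salsa64 round is emulated above with a shift pair — `vpsllq` by `k` and `vpsrlq` by `64-k`, both XOR-ed into the target (the 13-bit rotate appears as `vpsllq …,13` / `vpsrlq …,51`, the 39-bit one as `39`/`25`). A scalar sketch of the identity being relied on:

#include <stdint.h>

/* Rotate-left by k expressed the way the SIMD code does it: two shifts
   whose results are combined (XOR and OR are interchangeable here, since
   the two shifted halves occupy disjoint bits). Valid for 0 < k < 64. */
static inline uint64_t
rotl64_by_shifts(uint64_t x, unsigned k) {
    return (x << k) ^ (x >> (64 - k));
}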
algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx2.h (new file, 221 lines)
@@ -0,0 +1,221 @@
/* x64 */
#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)

#define SCRYPT_SALSA64_AVX2

asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx2)
    a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
    a2(shl rcx,7)
    a2(lea r9,[rcx-128])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(vmovdqa ymm0,[rax+0])
    a2(vmovdqa ymm1,[rax+32])
    a2(vmovdqa ymm2,[rax+64])
    a2(vmovdqa ymm3,[rax+96])
    aj(jz scrypt_ChunkMix_avx2_no_xor1)
    a3(vpxor ymm0,ymm0,[r9+0])
    a3(vpxor ymm1,ymm1,[r9+32])
    a3(vpxor ymm2,ymm2,[r9+64])
    a3(vpxor ymm3,ymm3,[r9+96])
    a1(scrypt_ChunkMix_avx2_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_avx2_loop:)
    a2(and rdx, rdx)
    a3(vpxor ymm0,ymm0,[rsi+r9+0])
    a3(vpxor ymm1,ymm1,[rsi+r9+32])
    a3(vpxor ymm2,ymm2,[rsi+r9+64])
    a3(vpxor ymm3,ymm3,[rsi+r9+96])
    aj(jz scrypt_ChunkMix_avx2_no_xor2)
    a3(vpxor ymm0,ymm0,[rdx+r9+0])
    a3(vpxor ymm1,ymm1,[rdx+r9+32])
    a3(vpxor ymm2,ymm2,[rdx+r9+64])
    a3(vpxor ymm3,ymm3,[rdx+r9+96])
    a1(scrypt_ChunkMix_avx2_no_xor2:)
    a2(vmovdqa ymm6,ymm0)
    a2(vmovdqa ymm7,ymm1)
    a2(vmovdqa ymm8,ymm2)
    a2(vmovdqa ymm9,ymm3)
    a2(mov rax,4)
    a1(scrypt_salsa64_avx2_loop: )
    a3(vpaddq ymm4, ymm1, ymm0)
    a3(vpshufd ymm4, ymm4, 0xb1)
    a3(vpxor ymm3, ymm3, ymm4)
    a3(vpaddq ymm4, ymm0, ymm3)
    a3(vpsrlq ymm5, ymm4, 51)
    a3(vpxor ymm2, ymm2, ymm5)
    a3(vpsllq ymm4, ymm4, 13)
    a3(vpxor ymm2, ymm2, ymm4)
    a3(vpaddq ymm4, ymm3, ymm2)
    a3(vpsrlq ymm5, ymm4, 25)
    a3(vpxor ymm1, ymm1, ymm5)
    a3(vpsllq ymm4, ymm4, 39)
    a3(vpxor ymm1, ymm1, ymm4)
    a3(vpaddq ymm4, ymm2, ymm1)
    a3(vpshufd ymm4, ymm4, 0xb1)
    a3(vpermq ymm1, ymm1, 0x39)
    a3(vpermq ymm10, ymm2, 0x4e)
    a3(vpxor ymm0, ymm0, ymm4)
    a3(vpermq ymm3, ymm3, 0x93)
    a3(vpaddq ymm4, ymm3, ymm0)
    a3(vpshufd ymm4, ymm4, 0xb1)
    a3(vpxor ymm1, ymm1, ymm4)
    a3(vpaddq ymm4, ymm0, ymm1)
    a3(vpsrlq ymm5, ymm4, 51)
    a3(vpxor ymm10, ymm10, ymm5)
    a3(vpsllq ymm4, ymm4, 13)
    a3(vpxor ymm10, ymm10, ymm4)
    a3(vpaddq ymm4, ymm1, ymm10)
    a3(vpsrlq ymm5, ymm4, 25)
    a3(vpxor ymm3, ymm3, ymm5)
    a3(vpsllq ymm4, ymm4, 39)
    a3(vpermq ymm1, ymm1, 0x93)
    a3(vpxor ymm3, ymm3, ymm4)
    a3(vpermq ymm2, ymm10, 0x4e)
    a3(vpaddq ymm4, ymm10, ymm3)
    a3(vpshufd ymm4, ymm4, 0xb1)
    a3(vpermq ymm3, ymm3, 0x39)
    a3(vpxor ymm0, ymm0, ymm4)
    a1(dec rax)
    aj(jnz scrypt_salsa64_avx2_loop)
    a3(vpaddq ymm0,ymm0,ymm6)
    a3(vpaddq ymm1,ymm1,ymm7)
    a3(vpaddq ymm2,ymm2,ymm8)
    a3(vpaddq ymm3,ymm3,ymm9)
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0xff)
    a2(add r9,128)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(vmovdqa [rax+0],ymm0)
    a2(vmovdqa [rax+32],ymm1)
    a2(vmovdqa [rax+64],ymm2)
    a2(vmovdqa [rax+96],ymm3)
    aj(jne scrypt_ChunkMix_avx2_loop)
    a1(vzeroupper)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx2)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_AVX2

static void asm_calling_convention
scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1);
    y0 = ymmp[0];
    y1 = ymmp[1];
    y2 = ymmp[2];
    y3 = ymmp[3];

    if (Bxor) {
        ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        y0 = _mm256_xor_si256(y0, ymmp[0]);
        y1 = _mm256_xor_si256(y1, ymmp[1]);
        y2 = _mm256_xor_si256(y2, ymmp[2]);
        y3 = _mm256_xor_si256(y3, ymmp[3]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        ymmp = (ymmi *)scrypt_block(Bin, i);
        y0 = _mm256_xor_si256(y0, ymmp[0]);
        y1 = _mm256_xor_si256(y1, ymmp[1]);
        y2 = _mm256_xor_si256(y2, ymmp[2]);
        y3 = _mm256_xor_si256(y3, ymmp[3]);

        if (Bxor) {
            ymmp = (ymmi *)scrypt_block(Bxor, i);
            y0 = _mm256_xor_si256(y0, ymmp[0]);
            y1 = _mm256_xor_si256(y1, ymmp[1]);
            y2 = _mm256_xor_si256(y2, ymmp[2]);
            y3 = _mm256_xor_si256(y3, ymmp[3]);
        }

        t0 = y0;
        t1 = y1;
        t2 = y2;
        t3 = y3;

        for (rounds = 8; rounds; rounds -= 2) {
            z0 = _mm256_add_epi64(y0, y1);
            z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            y3 = _mm256_xor_si256(y3, z0);
            z0 = _mm256_add_epi64(y3, y0);
            z1 = _mm256_srli_epi64(z0, 64-13);
            y2 = _mm256_xor_si256(y2, z1);
            z0 = _mm256_slli_epi64(z0, 13);
            y2 = _mm256_xor_si256(y2, z0);
            z0 = _mm256_add_epi64(y2, y3);
            z1 = _mm256_srli_epi64(z0, 64-39);
            y1 = _mm256_xor_si256(y1, z1);
            z0 = _mm256_slli_epi64(z0, 39);
            y1 = _mm256_xor_si256(y1, z0);
            y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1));
            y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
            y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3));
            z0 = _mm256_add_epi64(y1, y2);
            z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            y0 = _mm256_xor_si256(y0, z0);
            z0 = _mm256_add_epi64(y0, y3);
            z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            y1 = _mm256_xor_si256(y1, z0);
            z0 = _mm256_add_epi64(y1, y0);
            z1 = _mm256_srli_epi64(z0, 64-13);
            y2 = _mm256_xor_si256(y2, z1);
            z0 = _mm256_slli_epi64(z0, 13);
            y2 = _mm256_xor_si256(y2, z0);
            z0 = _mm256_add_epi64(y2, y1);
            z1 = _mm256_srli_epi64(z0, 64-39);
            y3 = _mm256_xor_si256(y3, z1);
            z0 = _mm256_slli_epi64(z0, 39);
            y3 = _mm256_xor_si256(y3, z0);
            z0 = _mm256_add_epi64(y3, y2);
            z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            y0 = _mm256_xor_si256(y0, z0);
            y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3));
            y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
            y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1));
        }

        y0 = _mm256_add_epi64(y0, t0);
        y1 = _mm256_add_epi64(y1, t1);
        y2 = _mm256_add_epi64(y2, t2);
        y3 = _mm256_add_epi64(y3, t3);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half);
        ymmp[0] = y0;
        ymmp[1] = y1;
        ymmp[2] = y2;
        ymmp[3] = y3;
    }
}

#endif

#if defined(SCRYPT_SALSA64_AVX2)
/* uses salsa64_core_tangle_sse2 */

#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8-AVX2"
#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#endif
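Where the 128-bit paths swap quadwords with `vpalignr` or `punpck*` pairs, the AVX2 path re-diagonalizes the state with full-width lane permutes: `_MM_SHUFFLE(0,3,2,1)` (imm8 `0x39`, matching the asm) rotates the four 64-bit lanes down by one, `_MM_SHUFFLE(1,0,3,2)` (`0x4e`) swaps halves, and `_MM_SHUFFLE(2,1,0,3)` (`0x93`) rotates up by one. A minimal standalone sketch of the first of these, assuming an AVX2-capable compiler:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Demonstrates that _mm256_permute4x64_epi64 with _MM_SHUFFLE(0,3,2,1)
   maps lanes {a,b,c,d} to {b,c,d,a} (rotate down by one). */
int main(void) {
    __m256i v = _mm256_set_epi64x(3, 2, 1, 0); /* lanes 0..3 hold 0..3 */
    __m256i p = _mm256_permute4x64_epi64(v, _MM_SHUFFLE(0,3,2,1));
    uint64_t out[4];
    _mm256_storeu_si256((__m256i *)out, p);
    printf("%llu %llu %llu %llu\n",
           (unsigned long long)out[0], (unsigned long long)out[1],
           (unsigned long long)out[2], (unsigned long long)out[3]);
    /* prints: 1 2 3 0 */
    return 0;
}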
algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-sse2.h (new file, 449 lines)
@@ -0,0 +1,449 @@
/* x64 */
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)

#define SCRYPT_SALSA64_SSE2

asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
    a1(push rbp)
    a2(mov rbp, rsp)
    a2(and rsp, ~63)
    a2(sub rsp, 128)
    a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
    a2(shl rcx,7)
    a2(lea r9,[rcx-128])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(movdqa xmm0,[rax+0])
    a2(movdqa xmm1,[rax+16])
    a2(movdqa xmm2,[rax+32])
    a2(movdqa xmm3,[rax+48])
    a2(movdqa xmm4,[rax+64])
    a2(movdqa xmm5,[rax+80])
    a2(movdqa xmm6,[rax+96])
    a2(movdqa xmm7,[rax+112])
    aj(jz scrypt_ChunkMix_sse2_no_xor1)
    a2(pxor xmm0,[r9+0])
    a2(pxor xmm1,[r9+16])
    a2(pxor xmm2,[r9+32])
    a2(pxor xmm3,[r9+48])
    a2(pxor xmm4,[r9+64])
    a2(pxor xmm5,[r9+80])
    a2(pxor xmm6,[r9+96])
    a2(pxor xmm7,[r9+112])
    a1(scrypt_ChunkMix_sse2_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_sse2_loop:)
    a2(and rdx, rdx)
    a2(pxor xmm0,[rsi+r9+0])
    a2(pxor xmm1,[rsi+r9+16])
    a2(pxor xmm2,[rsi+r9+32])
    a2(pxor xmm3,[rsi+r9+48])
    a2(pxor xmm4,[rsi+r9+64])
    a2(pxor xmm5,[rsi+r9+80])
    a2(pxor xmm6,[rsi+r9+96])
    a2(pxor xmm7,[rsi+r9+112])
    aj(jz scrypt_ChunkMix_sse2_no_xor2)
    a2(pxor xmm0,[rdx+r9+0])
    a2(pxor xmm1,[rdx+r9+16])
    a2(pxor xmm2,[rdx+r9+32])
    a2(pxor xmm3,[rdx+r9+48])
    a2(pxor xmm4,[rdx+r9+64])
    a2(pxor xmm5,[rdx+r9+80])
    a2(pxor xmm6,[rdx+r9+96])
    a2(pxor xmm7,[rdx+r9+112])
    a1(scrypt_ChunkMix_sse2_no_xor2:)
    a2(movdqa [rsp+0],xmm0)
    a2(movdqa [rsp+16],xmm1)
    a2(movdqa [rsp+32],xmm2)
    a2(movdqa [rsp+48],xmm3)
    a2(movdqa [rsp+64],xmm4)
    a2(movdqa [rsp+80],xmm5)
    a2(movdqa [rsp+96],xmm6)
    a2(movdqa [rsp+112],xmm7)
    a2(mov rax,8)
    a1(scrypt_salsa64_sse2_loop: )
    a2(movdqa xmm8, xmm0)
    a2(movdqa xmm9, xmm1)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm6, xmm8)
    a2(pxor xmm7, xmm9)
    a2(movdqa xmm10, xmm0)
    a2(movdqa xmm11, xmm1)
    a2(paddq xmm10, xmm6)
    a2(paddq xmm11, xmm7)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 51)
    a2(psrlq xmm11, 51)
    a2(psllq xmm8, 13)
    a2(psllq xmm9, 13)
    a2(pxor xmm4, xmm10)
    a2(pxor xmm5, xmm11)
    a2(pxor xmm4, xmm8)
    a2(pxor xmm5, xmm9)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(paddq xmm10, xmm4)
    a2(paddq xmm11, xmm5)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 25)
    a2(psrlq xmm11, 25)
    a2(psllq xmm8, 39)
    a2(psllq xmm9, 39)
    a2(pxor xmm2, xmm10)
    a2(pxor xmm3, xmm11)
    a2(pxor xmm2, xmm8)
    a2(pxor xmm3, xmm9)
    a2(movdqa xmm8, xmm4)
    a2(movdqa xmm9, xmm5)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm0, xmm8)
    a2(pxor xmm1, xmm9)
    a2(movdqa xmm8, xmm2)
    a2(movdqa xmm9, xmm3)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(movdqa xmm2, xmm7)
    a2(movdqa xmm3, xmm6)
    a2(punpcklqdq xmm10, xmm6)
    a2(punpcklqdq xmm11, xmm7)
    a2(movdqa xmm6, xmm8)
    a2(movdqa xmm7, xmm9)
    a2(punpcklqdq xmm9, xmm9)
    a2(punpcklqdq xmm8, xmm8)
    a2(punpckhqdq xmm2, xmm10)
    a2(punpckhqdq xmm3, xmm11)
    a2(punpckhqdq xmm6, xmm9)
    a2(punpckhqdq xmm7, xmm8)
    a2(sub rax, 2)
    a2(movdqa xmm8, xmm0)
    a2(movdqa xmm9, xmm1)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm6, xmm8)
    a2(pxor xmm7, xmm9)
    a2(movdqa xmm10, xmm0)
    a2(movdqa xmm11, xmm1)
    a2(paddq xmm10, xmm6)
    a2(paddq xmm11, xmm7)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 51)
    a2(psrlq xmm11, 51)
    a2(psllq xmm8, 13)
    a2(psllq xmm9, 13)
    a2(pxor xmm5, xmm10)
    a2(pxor xmm4, xmm11)
    a2(pxor xmm5, xmm8)
    a2(pxor xmm4, xmm9)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(paddq xmm10, xmm5)
    a2(paddq xmm11, xmm4)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 25)
    a2(psrlq xmm11, 25)
    a2(psllq xmm8, 39)
    a2(psllq xmm9, 39)
    a2(pxor xmm2, xmm10)
    a2(pxor xmm3, xmm11)
    a2(pxor xmm2, xmm8)
    a2(pxor xmm3, xmm9)
    a2(movdqa xmm8, xmm5)
    a2(movdqa xmm9, xmm4)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm0, xmm8)
    a2(pxor xmm1, xmm9)
    a2(movdqa xmm8, xmm2)
    a2(movdqa xmm9, xmm3)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(movdqa xmm2, xmm7)
    a2(movdqa xmm3, xmm6)
    a2(punpcklqdq xmm10, xmm6)
    a2(punpcklqdq xmm11, xmm7)
    a2(movdqa xmm6, xmm8)
    a2(movdqa xmm7, xmm9)
    a2(punpcklqdq xmm9, xmm9)
    a2(punpcklqdq xmm8, xmm8)
    a2(punpckhqdq xmm2, xmm10)
    a2(punpckhqdq xmm3, xmm11)
    a2(punpckhqdq xmm6, xmm9)
    a2(punpckhqdq xmm7, xmm8)
    aj(ja scrypt_salsa64_sse2_loop)
    a2(paddq xmm0,[rsp+0])
    a2(paddq xmm1,[rsp+16])
    a2(paddq xmm2,[rsp+32])
    a2(paddq xmm3,[rsp+48])
    a2(paddq xmm4,[rsp+64])
    a2(paddq xmm5,[rsp+80])
    a2(paddq xmm6,[rsp+96])
    a2(paddq xmm7,[rsp+112])
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0xff)
    a2(add r9,128)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(movdqa [rax+0],xmm0)
    a2(movdqa [rax+16],xmm1)
    a2(movdqa [rax+32],xmm2)
    a2(movdqa [rax+48],xmm3)
    a2(movdqa [rax+64],xmm4)
    a2(movdqa [rax+80],xmm5)
    a2(movdqa [rax+96],xmm6)
    a2(movdqa [rax+112],xmm7)
    aj(jne scrypt_ChunkMix_sse2_loop)
    a2(mov rsp, rbp)
    a1(pop rbp)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_SSE2

static void asm_calling_convention
scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];
    x4 = xmmp[4];
    x5 = xmmp[5];
    x6 = xmmp[6];
    x7 = xmmp[7];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
            x4 = _mm_xor_si128(x4, xmmp[4]);
            x5 = _mm_xor_si128(x5, xmmp[5]);
            x6 = _mm_xor_si128(x6, xmmp[6]);
            x7 = _mm_xor_si128(x7, xmmp[7]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;
        t4 = x4;
        t5 = x5;
        t6 = x6;
        t7 = x7;

        for (rounds = 8; rounds; rounds -= 2) {
            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x4 = _mm_xor_si128(x4, z2);
            x5 = _mm_xor_si128(x5, z3);
            x4 = _mm_xor_si128(x4, z0);
            x5 = _mm_xor_si128(x5, z1);

            z0 = _mm_add_epi64(x4, x6);
            z1 = _mm_add_epi64(x5, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x4);
            z1 = _mm_add_epi64(x3, x5);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x4;
            z1 = x5;
            z2 = x2;
            z3 = x3;
            x4 = z1;
            x5 = z0;
            x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
            x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
            x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
            x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));

            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x4 = _mm_xor_si128(x4, z2);
            x5 = _mm_xor_si128(x5, z3);
            x4 = _mm_xor_si128(x4, z0);
            x5 = _mm_xor_si128(x5, z1);

            z0 = _mm_add_epi64(x4, x6);
            z1 = _mm_add_epi64(x5, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x4);
            z1 = _mm_add_epi64(x3, x5);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x4;
            z1 = x5;
            z2 = x2;
            z3 = x3;
            x4 = z1;
            x5 = z0;
            x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
            x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
            x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
            x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
        }

        x0 = _mm_add_epi64(x0, t0);
        x1 = _mm_add_epi64(x1, t1);
        x2 = _mm_add_epi64(x2, t2);
        x3 = _mm_add_epi64(x3, t3);
        x4 = _mm_add_epi64(x4, t4);
        x5 = _mm_add_epi64(x5, t5);
        x6 = _mm_add_epi64(x6, t6);
        x7 = _mm_add_epi64(x7, t7);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
        xmmp[4] = x4;
        xmmp[5] = x5;
        xmmp[6] = x6;
        xmmp[7] = x7;
    }
}

#endif

#if defined(SCRYPT_SALSA64_SSE2)
#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8-SSE2"
#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#endif

/* sse3/avx use this as well */
#if defined(SCRYPT_SALSA64_INCLUDED)
    /*
        Default layout:
         0  1  2  3
         4  5  6  7
         8  9 10 11
        12 13 14 15

        SSE2 layout:
         0  5 10 15
        12  1  6 11
         8 13  2  7
         4  9 14  3
    */


    static void asm_calling_convention
    salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
        uint64_t t;
        while (count--) {
            t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
            t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
            t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
            t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
            t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
            t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
            blocks += 16;
        }
    }
#endif
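The tangle above is a fixed permutation built entirely from disjoint 2-cycles (1↔5, 2↔10, 3↔15, 4↔12, 7↔11, 9↔13), so applying it twice restores the default layout — the same routine converts blocks both into and out of the SSE2 ordering. A quick sketch of that involution property (assumes the function above is in scope):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Sketch: salsa64_core_tangle_sse2 is its own inverse, so
   tangle(tangle(x)) == x for any 16-word block. */
static void
tangle_involution_check(void) {
    uint64_t block[16], copy[16];
    size_t i;
    for (i = 0; i < 16; i++)
        block[i] = (uint64_t)i * 0x9e3779b97f4a7c15ull; /* arbitrary data */
    memcpy(copy, block, sizeof(block));
    salsa64_core_tangle_sse2(block, 1);
    salsa64_core_tangle_sse2(block, 1);
    assert(memcmp(block, copy, sizeof(block)) == 0);
}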
algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h (new file, 399 lines)
@@ -0,0 +1,399 @@
/* x64 */
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)

#define SCRYPT_SALSA64_SSSE3

asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_ssse3)
    a1(push rbp)
    a2(mov rbp, rsp)
    a2(and rsp, ~63)
    a2(sub rsp, 128)
    a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
    a2(shl rcx,7)
    a2(lea r9,[rcx-128])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(movdqa xmm0,[rax+0])
    a2(movdqa xmm1,[rax+16])
    a2(movdqa xmm2,[rax+32])
    a2(movdqa xmm3,[rax+48])
    a2(movdqa xmm4,[rax+64])
    a2(movdqa xmm5,[rax+80])
    a2(movdqa xmm6,[rax+96])
    a2(movdqa xmm7,[rax+112])
    aj(jz scrypt_ChunkMix_ssse3_no_xor1)
    a2(pxor xmm0,[r9+0])
    a2(pxor xmm1,[r9+16])
    a2(pxor xmm2,[r9+32])
    a2(pxor xmm3,[r9+48])
    a2(pxor xmm4,[r9+64])
    a2(pxor xmm5,[r9+80])
    a2(pxor xmm6,[r9+96])
    a2(pxor xmm7,[r9+112])
    a1(scrypt_ChunkMix_ssse3_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_ssse3_loop:)
    a2(and rdx, rdx)
    a2(pxor xmm0,[rsi+r9+0])
    a2(pxor xmm1,[rsi+r9+16])
    a2(pxor xmm2,[rsi+r9+32])
    a2(pxor xmm3,[rsi+r9+48])
    a2(pxor xmm4,[rsi+r9+64])
    a2(pxor xmm5,[rsi+r9+80])
    a2(pxor xmm6,[rsi+r9+96])
    a2(pxor xmm7,[rsi+r9+112])
    aj(jz scrypt_ChunkMix_ssse3_no_xor2)
    a2(pxor xmm0,[rdx+r9+0])
    a2(pxor xmm1,[rdx+r9+16])
    a2(pxor xmm2,[rdx+r9+32])
    a2(pxor xmm3,[rdx+r9+48])
    a2(pxor xmm4,[rdx+r9+64])
    a2(pxor xmm5,[rdx+r9+80])
    a2(pxor xmm6,[rdx+r9+96])
    a2(pxor xmm7,[rdx+r9+112])
    a1(scrypt_ChunkMix_ssse3_no_xor2:)
    a2(movdqa [rsp+0],xmm0)
    a2(movdqa [rsp+16],xmm1)
    a2(movdqa [rsp+32],xmm2)
    a2(movdqa [rsp+48],xmm3)
    a2(movdqa [rsp+64],xmm4)
    a2(movdqa [rsp+80],xmm5)
    a2(movdqa [rsp+96],xmm6)
    a2(movdqa [rsp+112],xmm7)
    a2(mov rax,8)
    a1(scrypt_salsa64_ssse3_loop: )
    a2(movdqa xmm8, xmm0)
    a2(movdqa xmm9, xmm1)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm6, xmm8)
    a2(pxor xmm7, xmm9)
    a2(movdqa xmm10, xmm0)
    a2(movdqa xmm11, xmm1)
    a2(paddq xmm10, xmm6)
    a2(paddq xmm11, xmm7)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 51)
    a2(psrlq xmm11, 51)
    a2(psllq xmm8, 13)
    a2(psllq xmm9, 13)
    a2(pxor xmm4, xmm10)
    a2(pxor xmm5, xmm11)
    a2(pxor xmm4, xmm8)
    a2(pxor xmm5, xmm9)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(paddq xmm10, xmm4)
    a2(paddq xmm11, xmm5)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 25)
    a2(psrlq xmm11, 25)
    a2(psllq xmm8, 39)
    a2(psllq xmm9, 39)
    a2(pxor xmm2, xmm10)
    a2(pxor xmm3, xmm11)
    a2(pxor xmm2, xmm8)
    a2(pxor xmm3, xmm9)
    a2(movdqa xmm8, xmm4)
    a2(movdqa xmm9, xmm5)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm0, xmm8)
    a2(pxor xmm1, xmm9)
    a2(movdqa xmm10, xmm2)
    a2(movdqa xmm11, xmm3)
    a2(movdqa xmm2, xmm6)
    a2(movdqa xmm3, xmm7)
    a3(palignr xmm2, xmm7, 8)
    a3(palignr xmm3, xmm6, 8)
    a2(movdqa xmm6, xmm11)
    a2(movdqa xmm7, xmm10)
    a3(palignr xmm6, xmm10, 8)
    a3(palignr xmm7, xmm11, 8)
    a2(sub rax, 2)
    a2(movdqa xmm8, xmm0)
    a2(movdqa xmm9, xmm1)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm6, xmm8)
    a2(pxor xmm7, xmm9)
    a2(movdqa xmm10, xmm0)
    a2(movdqa xmm11, xmm1)
    a2(paddq xmm10, xmm6)
    a2(paddq xmm11, xmm7)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 51)
    a2(psrlq xmm11, 51)
    a2(psllq xmm8, 13)
    a2(psllq xmm9, 13)
    a2(pxor xmm5, xmm10)
    a2(pxor xmm4, xmm11)
    a2(pxor xmm5, xmm8)
    a2(pxor xmm4, xmm9)
    a2(movdqa xmm10, xmm6)
    a2(movdqa xmm11, xmm7)
    a2(paddq xmm10, xmm5)
    a2(paddq xmm11, xmm4)
    a2(movdqa xmm8, xmm10)
    a2(movdqa xmm9, xmm11)
    a2(psrlq xmm10, 25)
    a2(psrlq xmm11, 25)
    a2(psllq xmm8, 39)
    a2(psllq xmm9, 39)
    a2(pxor xmm2, xmm10)
    a2(pxor xmm3, xmm11)
    a2(pxor xmm2, xmm8)
    a2(pxor xmm3, xmm9)
    a2(movdqa xmm8, xmm5)
    a2(movdqa xmm9, xmm4)
    a2(paddq xmm8, xmm2)
    a2(paddq xmm9, xmm3)
    a3(pshufd xmm8, xmm8, 0xb1)
    a3(pshufd xmm9, xmm9, 0xb1)
    a2(pxor xmm0, xmm8)
    a2(pxor xmm1, xmm9)
    a2(movdqa xmm10, xmm2)
    a2(movdqa xmm11, xmm3)
    a2(movdqa xmm2, xmm6)
    a2(movdqa xmm3, xmm7)
    a3(palignr xmm2, xmm7, 8)
    a3(palignr xmm3, xmm6, 8)
    a2(movdqa xmm6, xmm11)
    a2(movdqa xmm7, xmm10)
    a3(palignr xmm6, xmm10, 8)
    a3(palignr xmm7, xmm11, 8)
    aj(ja scrypt_salsa64_ssse3_loop)
    a2(paddq xmm0,[rsp+0])
    a2(paddq xmm1,[rsp+16])
    a2(paddq xmm2,[rsp+32])
    a2(paddq xmm3,[rsp+48])
    a2(paddq xmm4,[rsp+64])
    a2(paddq xmm5,[rsp+80])
    a2(paddq xmm6,[rsp+96])
    a2(paddq xmm7,[rsp+112])
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0xff)
    a2(add r9,128)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(movdqa [rax+0],xmm0)
    a2(movdqa [rax+16],xmm1)
    a2(movdqa [rax+32],xmm2)
    a2(movdqa [rax+48],xmm3)
    a2(movdqa [rax+64],xmm4)
    a2(movdqa [rax+80],xmm5)
    a2(movdqa [rax+96],xmm6)
    a2(movdqa [rax+112],xmm7)
    aj(jne scrypt_ChunkMix_ssse3_loop)
    a2(mov rsp, rbp)
    a1(pop rbp)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_SSSE3

static void asm_calling_convention
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];
    x4 = xmmp[4];
    x5 = xmmp[5];
    x6 = xmmp[6];
    x7 = xmmp[7];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
            x4 = _mm_xor_si128(x4, xmmp[4]);
            x5 = _mm_xor_si128(x5, xmmp[5]);
            x6 = _mm_xor_si128(x6, xmmp[6]);
            x7 = _mm_xor_si128(x7, xmmp[7]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;
        t4 = x4;
        t5 = x5;
        t6 = x6;
        t7 = x7;

        for (rounds = 8; rounds; rounds -= 2) {
            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x4 = _mm_xor_si128(x4, z2);
            x5 = _mm_xor_si128(x5, z3);
            x4 = _mm_xor_si128(x4, z0);
            x5 = _mm_xor_si128(x5, z1);

            z0 = _mm_add_epi64(x4, x6);
            z1 = _mm_add_epi64(x5, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x4);
            z1 = _mm_add_epi64(x3, x5);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x2;
            z1 = x3;
            x2 = _mm_alignr_epi8(x6, x7, 8);
            x3 = _mm_alignr_epi8(x7, x6, 8);
            x6 = _mm_alignr_epi8(z1, z0, 8);
            x7 = _mm_alignr_epi8(z0, z1, 8);

            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x5 = _mm_xor_si128(x5, z2);
            x4 = _mm_xor_si128(x4, z3);
            x5 = _mm_xor_si128(x5, z0);
            x4 = _mm_xor_si128(x4, z1);

            z0 = _mm_add_epi64(x5, x6);
            z1 = _mm_add_epi64(x4, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x5);
            z1 = _mm_add_epi64(x3, x4);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x2;
            z1 = x3;
            x2 = _mm_alignr_epi8(x6, x7, 8);
            x3 = _mm_alignr_epi8(x7, x6, 8);
            x6 = _mm_alignr_epi8(z1, z0, 8);
            x7 = _mm_alignr_epi8(z0, z1, 8);
        }

        x0 = _mm_add_epi64(x0, t0);
        x1 = _mm_add_epi64(x1, t1);
        x2 = _mm_add_epi64(x2, t2);
        x3 = _mm_add_epi64(x3, t3);
        x4 = _mm_add_epi64(x4, t4);
        x5 = _mm_add_epi64(x5, t5);
        x6 = _mm_add_epi64(x6, t6);
        x7 = _mm_add_epi64(x7, t7);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
        xmmp[4] = x4;
        xmmp[5] = x5;
        xmmp[6] = x6;
        xmmp[7] = x7;
    }
}

#endif

#if defined(SCRYPT_SALSA64_SSSE3)
/* uses salsa64_core_tangle_sse2 */

#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8-SSSE3"
#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#endif
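The SSSE3 variant replaces the SSE2 `punpcklqdq`/`punpckhqdq` cascade with `palignr`, which concatenates two registers and extracts a byte-shifted window; with a shift of 8 it pairs the high qword of one register with the low qword of another in a single instruction. A small standalone sketch of that extraction, in the intrinsic form used by the `_mm_alignr_epi8` calls above:

#include <tmmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Demonstrates _mm_alignr_epi8(a, b, 8): the result packs b's high qword
   into lane 0 and a's low qword into lane 1 — the cross-register qword
   swap the round function relies on. */
int main(void) {
    __m128i a = _mm_set_epi64x(0xA1, 0xA0); /* lanes: a0 (low), a1 (high) */
    __m128i b = _mm_set_epi64x(0xB1, 0xB0);
    __m128i r = _mm_alignr_epi8(a, b, 8);
    uint64_t out[2];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%llx %llx\n", (unsigned long long)out[0],
           (unsigned long long)out[1]); /* prints: b1 a0 */
    return 0;
}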
algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-xop.h (new file, 335 lines)
@@ -0,0 +1,335 @@
/* x64 */
#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)

#define SCRYPT_SALSA64_XOP

asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_xop)
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)
	a2(sub rsp, 128)
	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
	a2(shl rcx,7)
	a2(lea r9,[rcx-128])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	a2(and rdx, rdx)
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	a2(vmovdqa xmm4,[rax+64])
	a2(vmovdqa xmm5,[rax+80])
	a2(vmovdqa xmm6,[rax+96])
	a2(vmovdqa xmm7,[rax+112])
	aj(jz scrypt_ChunkMix_xop_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a3(vpxor xmm4,xmm4,[r9+64])
	a3(vpxor xmm5,xmm5,[r9+80])
	a3(vpxor xmm6,xmm6,[r9+96])
	a3(vpxor xmm7,xmm7,[r9+112])
	a1(scrypt_ChunkMix_xop_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
	a1(scrypt_ChunkMix_xop_loop:)
	a2(and rdx, rdx)
	a3(vpxor xmm0,xmm0,[rsi+r9+0])
	a3(vpxor xmm1,xmm1,[rsi+r9+16])
	a3(vpxor xmm2,xmm2,[rsi+r9+32])
	a3(vpxor xmm3,xmm3,[rsi+r9+48])
	a3(vpxor xmm4,xmm4,[rsi+r9+64])
	a3(vpxor xmm5,xmm5,[rsi+r9+80])
	a3(vpxor xmm6,xmm6,[rsi+r9+96])
	a3(vpxor xmm7,xmm7,[rsi+r9+112])
	aj(jz scrypt_ChunkMix_xop_no_xor2)
	a3(vpxor xmm0,xmm0,[rdx+r9+0])
	a3(vpxor xmm1,xmm1,[rdx+r9+16])
	a3(vpxor xmm2,xmm2,[rdx+r9+32])
	a3(vpxor xmm3,xmm3,[rdx+r9+48])
	a3(vpxor xmm4,xmm4,[rdx+r9+64])
	a3(vpxor xmm5,xmm5,[rdx+r9+80])
	a3(vpxor xmm6,xmm6,[rdx+r9+96])
	a3(vpxor xmm7,xmm7,[rdx+r9+112])
	a1(scrypt_ChunkMix_xop_no_xor2:)
	a2(vmovdqa [rsp+0],xmm0)
	a2(vmovdqa [rsp+16],xmm1)
	a2(vmovdqa [rsp+32],xmm2)
	a2(vmovdqa [rsp+48],xmm3)
	a2(vmovdqa [rsp+64],xmm4)
	a2(vmovdqa [rsp+80],xmm5)
	a2(vmovdqa [rsp+96],xmm6)
	a2(vmovdqa [rsp+112],xmm7)
	a2(mov rax,8)
	a1(scrypt_salsa64_xop_loop: )
	a3(vpaddq xmm8, xmm0, xmm2)
	a3(vpaddq xmm9, xmm1, xmm3)
	a3(vpshufd xmm8, xmm8, 0xb1)
	a3(vpshufd xmm9, xmm9, 0xb1)
	a3(vpxor xmm6, xmm6, xmm8)
	a3(vpxor xmm7, xmm7, xmm9)
	a3(vpaddq xmm10, xmm0, xmm6)
	a3(vpaddq xmm11, xmm1, xmm7)
	a3(vprotq xmm10, xmm10, 13)
	a3(vprotq xmm11, xmm11, 13)
	a3(vpxor xmm4, xmm4, xmm10)
	a3(vpxor xmm5, xmm5, xmm11)
	a3(vpaddq xmm8, xmm6, xmm4)
	a3(vpaddq xmm9, xmm7, xmm5)
	a3(vprotq xmm8, xmm8, 39)
	a3(vprotq xmm9, xmm9, 39)
	a3(vpxor xmm2, xmm2, xmm8)
	a3(vpxor xmm3, xmm3, xmm9)
	a3(vpaddq xmm10, xmm4, xmm2)
	a3(vpaddq xmm11, xmm5, xmm3)
	a3(vpshufd xmm10, xmm10, 0xb1)
	a3(vpshufd xmm11, xmm11, 0xb1)
	a3(vpxor xmm0, xmm0, xmm10)
	a3(vpxor xmm1, xmm1, xmm11)
	a2(vmovdqa xmm8, xmm2)
	a2(vmovdqa xmm9, xmm3)
	a4(vpalignr xmm2, xmm6, xmm7, 8)
	a4(vpalignr xmm3, xmm7, xmm6, 8)
	a4(vpalignr xmm6, xmm9, xmm8, 8)
	a4(vpalignr xmm7, xmm8, xmm9, 8)
	a3(vpaddq xmm10, xmm0, xmm2)
	a3(vpaddq xmm11, xmm1, xmm3)
	a3(vpshufd xmm10, xmm10, 0xb1)
	a3(vpshufd xmm11, xmm11, 0xb1)
	a3(vpxor xmm6, xmm6, xmm10)
	a3(vpxor xmm7, xmm7, xmm11)
	a3(vpaddq xmm8, xmm0, xmm6)
	a3(vpaddq xmm9, xmm1, xmm7)
	a3(vprotq xmm8, xmm8, 13)
	a3(vprotq xmm9, xmm9, 13)
	a3(vpxor xmm5, xmm5, xmm8)
	a3(vpxor xmm4, xmm4, xmm9)
	a3(vpaddq xmm10, xmm6, xmm5)
	a3(vpaddq xmm11, xmm7, xmm4)
	a3(vprotq xmm10, xmm10, 39)
	a3(vprotq xmm11, xmm11, 39)
	a3(vpxor xmm2, xmm2, xmm10)
	a3(vpxor xmm3, xmm3, xmm11)
	a3(vpaddq xmm8, xmm5, xmm2)
	a3(vpaddq xmm9, xmm4, xmm3)
	a3(vpshufd xmm8, xmm8, 0xb1)
	a3(vpshufd xmm9, xmm9, 0xb1)
	a3(vpxor xmm0, xmm0, xmm8)
	a3(vpxor xmm1, xmm1, xmm9)
	a2(vmovdqa xmm10, xmm2)
	a2(vmovdqa xmm11, xmm3)
	a4(vpalignr xmm2, xmm6, xmm7, 8)
	a4(vpalignr xmm3, xmm7, xmm6, 8)
	a4(vpalignr xmm6, xmm11, xmm10, 8)
	a4(vpalignr xmm7, xmm10, xmm11, 8)
	a2(sub rax, 2)
	aj(ja scrypt_salsa64_xop_loop)
	a3(vpaddq xmm0,xmm0,[rsp+0])
	a3(vpaddq xmm1,xmm1,[rsp+16])
	a3(vpaddq xmm2,xmm2,[rsp+32])
	a3(vpaddq xmm3,xmm3,[rsp+48])
	a3(vpaddq xmm4,xmm4,[rsp+64])
	a3(vpaddq xmm5,xmm5,[rsp+80])
	a3(vpaddq xmm6,xmm6,[rsp+96])
	a3(vpaddq xmm7,xmm7,[rsp+112])
	a2(lea rax,[r8+r9])
	a2(xor r8,rcx)
	a2(and rax,~0xff)
	a2(add r9,128)
	a2(shr rax,1)
	a2(add rax, rdi)
	a2(cmp r9,rcx)
	a2(vmovdqa [rax+0],xmm0)
	a2(vmovdqa [rax+16],xmm1)
	a2(vmovdqa [rax+32],xmm2)
	a2(vmovdqa [rax+48],xmm3)
	a2(vmovdqa [rax+64],xmm4)
	a2(vmovdqa [rax+80],xmm5)
	a2(vmovdqa [rax+96],xmm6)
	a2(vmovdqa [rax+112],xmm7)
	aj(jne scrypt_ChunkMix_xop_loop)
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_xop)

#endif


/* intrinsic */
#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_XOP

static void asm_calling_convention
scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1;
	size_t rounds;

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];
	x4 = xmmp[4];
	x5 = xmmp[5];
	x6 = xmmp[6];
	x7 = xmmp[7];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
		x4 = _mm_xor_si128(x4, xmmp[4]);
		x5 = _mm_xor_si128(x5, xmmp[5]);
		x6 = _mm_xor_si128(x6, xmmp[6]);
		x7 = _mm_xor_si128(x7, xmmp[7]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
			x4 = _mm_xor_si128(x4, xmmp[4]);
			x5 = _mm_xor_si128(x5, xmmp[5]);
			x6 = _mm_xor_si128(x6, xmmp[6]);
			x7 = _mm_xor_si128(x7, xmmp[7]);
		}

		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;
		t4 = x4;
		t5 = x5;
		t6 = x6;
		t7 = x7;

		for (rounds = 8; rounds; rounds -= 2) {
			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z0 = _mm_roti_epi64(z0, 13);
			z1 = _mm_roti_epi64(z1, 13);
			x4 = _mm_xor_si128(x4, z0);
			x5 = _mm_xor_si128(x5, z1);

			z0 = _mm_add_epi64(x4, x6);
			z1 = _mm_add_epi64(x5, x7);
			z0 = _mm_roti_epi64(z0, 39);
			z1 = _mm_roti_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x4);
			z1 = _mm_add_epi64(x3, x5);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);

			z0 = _mm_add_epi64(x0, x2);
			z1 = _mm_add_epi64(x1, x3);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x6 = _mm_xor_si128(x6, z0);
			x7 = _mm_xor_si128(x7, z1);

			z0 = _mm_add_epi64(x6, x0);
			z1 = _mm_add_epi64(x7, x1);
			z0 = _mm_roti_epi64(z0, 13);
			z1 = _mm_roti_epi64(z1, 13);
			x5 = _mm_xor_si128(x5, z0);
			x4 = _mm_xor_si128(x4, z1);

			z0 = _mm_add_epi64(x5, x6);
			z1 = _mm_add_epi64(x4, x7);
			z0 = _mm_roti_epi64(z0, 39);
			z1 = _mm_roti_epi64(z1, 39);
			x2 = _mm_xor_si128(x2, z0);
			x3 = _mm_xor_si128(x3, z1);

			z0 = _mm_add_epi64(x2, x5);
			z1 = _mm_add_epi64(x3, x4);
			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
			x0 = _mm_xor_si128(x0, z0);
			x1 = _mm_xor_si128(x1, z1);

			z0 = x2;
			z1 = x3;
			x2 = _mm_alignr_epi8(x6, x7, 8);
			x3 = _mm_alignr_epi8(x7, x6, 8);
			x6 = _mm_alignr_epi8(z1, z0, 8);
			x7 = _mm_alignr_epi8(z0, z1, 8);
		}

		x0 = _mm_add_epi64(x0, t0);
		x1 = _mm_add_epi64(x1, t1);
		x2 = _mm_add_epi64(x2, t2);
		x3 = _mm_add_epi64(x3, t3);
		x4 = _mm_add_epi64(x4, t4);
		x5 = _mm_add_epi64(x5, t5);
		x6 = _mm_add_epi64(x6, t6);
		x7 = _mm_add_epi64(x7, t7);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
		xmmp[4] = x4;
		xmmp[5] = x5;
		xmmp[6] = x6;
		xmmp[7] = x7;
	}
}

#endif

#if defined(SCRYPT_SALSA64_XOP)
	/* uses salsa64_core_tangle_sse2 */

	#undef SCRYPT_MIX
	#define SCRYPT_MIX "Salsa64/8-XOP"
	#undef SCRYPT_SALSA64_INCLUDED
	#define SCRYPT_SALSA64_INCLUDED
#endif
41
algo/argon2/ar2/sj/scrypt-jane-mix_salsa64.h
Normal file
@@ -0,0 +1,41 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)

#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8 Ref"

#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_BASIC

static void
salsa64_core_basic(uint64_t state[16]) {
	const size_t rounds = 8;
	uint64_t v[16], t;
	size_t i;

	for (i = 0; i < 16; i++) v[i] = state[i];

	#define G(a,b,c,d) \
		t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \
		t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \
		t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \
		t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t;

	for (i = 0; i < rounds; i += 2) {
		G( 0, 4, 8,12);
		G( 5, 9,13, 1);
		G(10,14, 2, 6);
		G(15, 3, 7,11);
		G( 0, 1, 2, 3);
		G( 5, 6, 7, 4);
		G(10,11, 8, 9);
		G(15,12,13,14);
	}

	for (i = 0; i < 16; i++) state[i] += v[i];

	#undef G
}

#endif
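Expanding one quarter-round of the reference mix makes the add-rotate-xor ladder and the 32/13/39/32 rotation constants explicit; this is G(0, 4, 8, 12) written out by hand (an illustration only, with v and t as in salsa64_core_basic):

	t = v[ 0] + v[12]; t = ROTL64(t, 32); v[ 4] ^= t;
	t = v[ 4] + v[ 0]; t = ROTL64(t, 13); v[ 8] ^= t;
	t = v[ 8] + v[ 4]; t = ROTL64(t, 39); v[12] ^= t;
	t = v[12] + v[ 8]; t = ROTL64(t, 32); v[ 0] ^= t;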
112
algo/argon2/ar2/sj/scrypt-jane-pbkdf2.h
Normal file
@@ -0,0 +1,112 @@
typedef struct scrypt_hmac_state_t {
	scrypt_hash_state inner, outer;
} scrypt_hmac_state;


static void
scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
	scrypt_hash_state st;
	scrypt_hash_init(&st);
	scrypt_hash_update(&st, m, mlen);
	scrypt_hash_finish(&st, hash);
}

/* hmac */
static void
scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
	uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
	size_t i;

	scrypt_hash_init(&st->inner);
	scrypt_hash_init(&st->outer);

	if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
		/* use the key directly if it's <= blocksize bytes */
		memcpy(pad, key, keylen);
	} else {
		/* if it's > blocksize bytes, hash it */
		scrypt_hash(pad, key, keylen);
	}

	/* inner = (key ^ 0x36) */
	/* h(inner || ...) */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
		pad[i] ^= 0x36;
	scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);

	/* outer = (key ^ 0x5c) */
	/* h(outer || ...) */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
		pad[i] ^= (0x5c ^ 0x36);
	scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);

	scrypt_ensure_zero(pad, sizeof(pad));
}

static void
scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
	/* h(inner || m...) */
	scrypt_hash_update(&st->inner, m, mlen);
}

static void
scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
	/* h(inner || m) */
	scrypt_hash_digest innerhash;
	scrypt_hash_finish(&st->inner, innerhash);

	/* h(outer || h(inner || m)) */
	scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
	scrypt_hash_finish(&st->outer, mac);

	scrypt_ensure_zero(st, sizeof(*st));
}

static void
scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
	scrypt_hash_digest ti, u;
	uint8_t be[4];
	uint32_t i, j, blocks;
	uint64_t c;

	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */

	/* hmac(password, ...) */
	scrypt_hmac_init(&hmac_pw, password, password_len);

	/* hmac(password, salt...) */
	hmac_pw_salt = hmac_pw;
	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);

	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
	for (i = 1; i <= blocks; i++) {
		/* U1 = hmac(password, salt || be(i)) */
		U32TO8_BE(be, i);
		work = hmac_pw_salt;
		scrypt_hmac_update(&work, be, 4);
		scrypt_hmac_finish(&work, ti);
		memcpy(u, ti, sizeof(u));

		/* T[i] = U1 ^ U2 ^ U3... */
		for (c = 0; c < N - 1; c++) {
			/* UX = hmac(password, U{X-1}) */
			work = hmac_pw;
			scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
			scrypt_hmac_finish(&work, u);

			/* T[i] ^= UX */
			for (j = 0; j < sizeof(u); j++)
				ti[j] ^= u[j];
		}

		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
		out += SCRYPT_HASH_DIGEST_SIZE;
		bytes -= SCRYPT_HASH_DIGEST_SIZE;
	}

	scrypt_ensure_zero(ti, sizeof(ti));
	scrypt_ensure_zero(u, sizeof(u));
	scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
	scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
}
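For orientation, a minimal call sketch (hypothetical driver code; the literals are illustrative, not taken from these headers). Note that N here is the PBKDF2 iteration count per output block, and scrypt itself invokes this with N = 1:

	uint8_t key[64];
	scrypt_pbkdf2((const uint8_t *)"password", 8,
	              (const uint8_t *)"NaCl", 4,
	              1 /* N: HMAC iterations per block */, key, sizeof(key));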
463
algo/argon2/ar2/sj/scrypt-jane-portable-x86.h
Normal file
@@ -0,0 +1,463 @@
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
#define X86ASM

/* gcc 2.95 royally screws up stack alignments on variables */
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
	#define X86ASM_SSE
	#define X86ASM_SSE2
#endif
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
	#define X86ASM_SSSE3
#endif
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
	#define X86ASM_AVX
	#define X86ASM_XOP
#endif
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
	#define X86ASM_AVX2
#endif
#endif

#if defined(CPU_X86_64) && defined(COMPILER_GCC)
	#define X86_64ASM
	#define X86_64ASM_SSE2
	#if (COMPILER_GCC >= 40102)
		#define X86_64ASM_SSSE3
	#endif
	#if (COMPILER_GCC >= 40400)
		#define X86_64ASM_AVX
		#define X86_64ASM_XOP
	#endif
	#if (COMPILER_GCC >= 40700)
		#define X86_64ASM_AVX2
	#endif
#endif

#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64))
	#define X86_INTRINSIC
	#if defined(CPU_X86_64) || defined(X86ASM_SSE)
		#define X86_INTRINSIC_SSE
	#endif
	#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
		#define X86_INTRINSIC_SSE2
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
		#define X86_INTRINSIC_SSSE3
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
		#define X86_INTRINSIC_AVX
		#define X86_INTRINSIC_XOP
	#endif
	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
		#define X86_INTRINSIC_AVX2
	#endif
#endif

#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
	#define X86_INTRINSIC
	#if defined(__SSE__)
		#define X86_INTRINSIC_SSE
	#endif
	#if defined(__SSE2__)
		#define X86_INTRINSIC_SSE2
	#endif
	#if defined(__SSSE3__)
		#define X86_INTRINSIC_SSSE3
	#endif
	#if defined(__AVX__)
		#define X86_INTRINSIC_AVX
	#endif
	#if defined(__XOP__)
		#define X86_INTRINSIC_XOP
	#endif
	#if defined(__AVX2__)
		#define X86_INTRINSIC_AVX2
	#endif
#endif

/* only use simd on windows (or SSE2 on gcc)! */
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
	#if defined(X86_INTRINSIC_SSE)
		#include <mmintrin.h>
		#include <xmmintrin.h>
		typedef __m64 qmm;
		typedef __m128 xmm;
		typedef __m128d xmmd;
	#endif
	#if defined(X86_INTRINSIC_SSE2)
		#include <emmintrin.h>
		typedef __m128i xmmi;
	#endif
	#if defined(X86_INTRINSIC_SSSE3)
		#include <tmmintrin.h>
	#endif
	#if defined(X86_INTRINSIC_AVX)
		#include <immintrin.h>
	#endif
	#if defined(X86_INTRINSIC_XOP)
		#if defined(COMPILER_MSVC)
			#include <intrin.h>
		#else
			#include <x86intrin.h>
		#endif
	#endif
	#if defined(X86_INTRINSIC_AVX2)
		typedef __m256i ymmi;
	#endif
#endif

#if defined(X86_INTRINSIC_SSE2)
typedef union packedelem8_t {
	uint8_t u[16];
	xmmi v;
} packedelem8;

typedef union packedelem32_t {
	uint32_t u[4];
	xmmi v;
} packedelem32;

typedef union packedelem64_t {
	uint64_t u[2];
	xmmi v;
} packedelem64;
#else
typedef union packedelem8_t {
	uint8_t u[16];
	uint32_t dw[4];
} packedelem8;

typedef union packedelem32_t {
	uint32_t u[4];
	uint8_t b[16];
} packedelem32;

typedef union packedelem64_t {
	uint64_t u[2];
	uint8_t b[16];
} packedelem64;
#endif

#if defined(X86_INTRINSIC_SSSE3)
static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
static const packedelem8 ALIGN(16) ssse3_rotl8_32bit  = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
#endif

/*
	x86 inline asm for gcc/msvc. usage:

	asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
	asm_naked_fn(name)
		a1(..)
		a2(.., ..)
		a3(.., .., ..)
		64bit OR 0 parameters: a1(ret)
		32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters
	asm_naked_fn_end(name)
*/

#if defined(X86ASM) || defined(X86_64ASM)

#if defined(COMPILER_MSVC)
	#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
	#define a1(x) __asm {x}
	#define a2(x, y) __asm {x, y}
	#define a3(x, y, z) __asm {x, y, z}
	#define a4(x, y, z, w) __asm {x, y, z, w}
	#define aj(x) __asm {x}
	#define asm_align8 a1(ALIGN 8)
	#define asm_align16 a1(ALIGN 16)

	#define asm_calling_convention STDCALL
	#define aret(n) a1(ret n)
	#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
	#define asm_naked_fn(fn) {
	#define asm_naked_fn_end(fn) }
#elif defined(COMPILER_GCC)
	#define GNU_AS1(x) #x ";\n"
	#define GNU_AS2(x, y) #x ", " #y ";\n"
	#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
	#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
	#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
	#define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n"

	#define a1(x) GNU_AS1(x)
	#define a2(x, y) GNU_AS2(x, y)
	#define a3(x, y, z) GNU_AS3(x, y, z)
	#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
	#define aj(x) GNU_ASJ(x)
	#define asm_align8 ".p2align 3,,7"
	#define asm_align16 ".p2align 4,,15"

	#if defined(OS_WINDOWS)
		#define asm_calling_convention CDECL
		#define aret(n) a1(ret)

		#if defined(X86_64ASM)
			#define asm_naked_fn(fn) ; __asm__ ( \
				".text\n" \
				asm_align16 GNU_ASFN(fn) \
				"subq $136, %rsp;" \
				"movdqa %xmm6, 0(%rsp);" \
				"movdqa %xmm7, 16(%rsp);" \
				"movdqa %xmm8, 32(%rsp);" \
				"movdqa %xmm9, 48(%rsp);" \
				"movdqa %xmm10, 64(%rsp);" \
				"movdqa %xmm11, 80(%rsp);" \
				"movdqa %xmm12, 96(%rsp);" \
				"movq %rdi, 112(%rsp);" \
				"movq %rsi, 120(%rsp);" \
				"movq %rcx, %rdi;" \
				"movq %rdx, %rsi;" \
				"movq %r8, %rdx;" \
				"movq %r9, %rcx;" \
				"call 1f;" \
				"movdqa 0(%rsp), %xmm6;" \
				"movdqa 16(%rsp), %xmm7;" \
				"movdqa 32(%rsp), %xmm8;" \
				"movdqa 48(%rsp), %xmm9;" \
				"movdqa 64(%rsp), %xmm10;" \
				"movdqa 80(%rsp), %xmm11;" \
				"movdqa 96(%rsp), %xmm12;" \
				"movq 112(%rsp), %rdi;" \
				"movq 120(%rsp), %rsi;" \
				"addq $136, %rsp;" \
				"ret;" \
				".intel_syntax noprefix;" \
				".p2align 4,,15;" \
				"1:;"
		#else
			#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
		#endif
	#else
		#define asm_calling_convention STDCALL
		#define aret(n) a1(ret n)
		#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
	#endif

	#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
	#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );

	#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
	#define asm_gcc_parms() ".att_syntax prefix;"
	#define asm_gcc_trashed() __asm__ __volatile__("" :::
	#define asm_gcc_end() );
#else
	need x86 asm
#endif

#endif /* X86ASM || X86_64ASM */


#if defined(CPU_X86) || defined(CPU_X86_64)

typedef enum cpu_flags_x86_t {
	cpu_mmx    = 1 << 0,
	cpu_sse    = 1 << 1,
	cpu_sse2   = 1 << 2,
	cpu_sse3   = 1 << 3,
	cpu_ssse3  = 1 << 4,
	cpu_sse4_1 = 1 << 5,
	cpu_sse4_2 = 1 << 6,
	cpu_avx    = 1 << 7,
	cpu_xop    = 1 << 8,
	cpu_avx2   = 1 << 9
} cpu_flags_x86;

typedef enum cpu_vendors_x86_t {
	cpu_nobody,
	cpu_intel,
	cpu_amd
} cpu_vendors_x86;

typedef struct x86_regs_t {
	uint32_t eax, ebx, ecx, edx;
} x86_regs;

#if defined(X86ASM)
asm_naked_fn_proto(int, has_cpuid)(void)
asm_naked_fn(has_cpuid)
	a1(pushfd)
	a1(pop eax)
	a2(mov ecx, eax)
	a2(xor eax, 0x200000)
	a1(push eax)
	a1(popfd)
	a1(pushfd)
	a1(pop eax)
	a2(xor eax, ecx)
	a2(shr eax, 21)
	a2(and eax, 1)
	a1(push ecx)
	a1(popfd)
	a1(ret)
asm_naked_fn_end(has_cpuid)
#endif /* X86ASM */


static void NOINLINE
get_cpuid(x86_regs *regs, uint32_t flags) {
#if defined(COMPILER_MSVC)
	__cpuid((int *)regs, (int)flags);
#else
	#if defined(CPU_X86_64)
		#define cpuid_bx rbx
	#else
		#define cpuid_bx ebx
	#endif

	asm_gcc()
		a1(push cpuid_bx)
		a2(xor ecx, ecx)
		a1(cpuid)
		a2(mov [%1 + 0], eax)
		a2(mov [%1 + 4], ebx)
		a2(mov [%1 + 8], ecx)
		a2(mov [%1 + 12], edx)
		a1(pop cpuid_bx)
	asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
	asm_gcc_end()
#endif
}

#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
static uint64_t NOINLINE
get_xgetbv(uint32_t flags) {
#if defined(COMPILER_MSVC)
	return _xgetbv(flags);
#else
	uint32_t lo, hi;
	asm_gcc()
		a1(xgetbv)
	asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
	asm_gcc_end()
	return ((uint64_t)lo | ((uint64_t)hi << 32));
#endif
}
#endif // AVX support

#if defined(SCRYPT_TEST_SPEED)
size_t cpu_detect_mask = (size_t)-1;
#endif

static size_t
detect_cpu(void) {
	//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
	//cpu_vendors_x86 vendor = cpu_nobody;
	x86_regs regs;
	uint32_t max_level, max_ext_level;
	size_t cpu_flags = 0;
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
	uint64_t xgetbv_flags;
#endif

#if defined(CPU_X86)
	if (!has_cpuid())
		return cpu_flags;
#endif

	get_cpuid(&regs, 0);
	max_level = regs.eax;
#if 0
	vendor_string.i[0] = regs.ebx;
	vendor_string.i[1] = regs.edx;
	vendor_string.i[2] = regs.ecx;

	if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
		vendor = cpu_intel;
	else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
		vendor = cpu_amd;
#endif
	if (max_level & 0x00000500) {
		/* "Intel P5 pre-B0" */
		cpu_flags |= cpu_mmx;
		return cpu_flags;
	}

	if (max_level < 1)
		return cpu_flags;

	get_cpuid(&regs, 1);
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
	/* xsave/xrestore */
	if (regs.ecx & (1 << 27)) {
		xgetbv_flags = get_xgetbv(0);
		if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;
	}
#endif
	if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
	if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1; /* CPUID.1:ECX bit 19 is SSE4.1, not SSE4.2 */
	if (regs.ecx & (1 <<  9)) cpu_flags |= cpu_ssse3;
	if (regs.ecx & (1      )) cpu_flags |= cpu_sse3;
	if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
	if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
	if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;

	if (cpu_flags & cpu_avx) {
		if (max_level >= 7) {
			get_cpuid(&regs, 7);
			if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
		}

		get_cpuid(&regs, 0x80000000);
		max_ext_level = regs.eax;
		if (max_ext_level >= 0x80000001) {
			get_cpuid(&regs, 0x80000001);
			if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
		}
	}


#if defined(SCRYPT_TEST_SPEED)
	cpu_flags &= cpu_detect_mask;
#endif

	return cpu_flags;
}

#if defined(SCRYPT_TEST_SPEED)
static const char *
get_top_cpuflag_desc(size_t flag) {
	if (flag & cpu_avx2)        return "AVX2";
	else if (flag & cpu_xop)    return "XOP";
	else if (flag & cpu_avx)    return "AVX";
	else if (flag & cpu_sse4_2) return "SSE4.2";
	else if (flag & cpu_sse4_1) return "SSE4.1";
	else if (flag & cpu_ssse3)  return "SSSE3";
	else if (flag & cpu_sse2)   return "SSE2";
	else if (flag & cpu_sse)    return "SSE";
	else if (flag & cpu_mmx)    return "MMX";
	else                        return "Basic";
}
#endif

/* enable the highest system-wide option */
#if defined(SCRYPT_CHOOSE_COMPILETIME)
	#if !defined(__AVX2__)
		#undef X86_64ASM_AVX2
		#undef X86ASM_AVX2
		#undef X86_INTRINSIC_AVX2
	#endif
	#if !defined(__XOP__)
		#undef X86_64ASM_XOP
		#undef X86ASM_XOP
		#undef X86_INTRINSIC_XOP
	#endif
	#if !defined(__AVX__)
		#undef X86_64ASM_AVX
		#undef X86ASM_AVX
		#undef X86_INTRINSIC_AVX
	#endif
	#if !defined(__SSSE3__)
		#undef X86_64ASM_SSSE3
		#undef X86ASM_SSSE3
		#undef X86_INTRINSIC_SSSE3
	#endif
	#if !defined(__SSE2__)
		#undef X86_64ASM_SSE2
		#undef X86ASM_SSE2
		#undef X86_INTRINSIC_SSE2
	#endif
#endif

#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
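A sketch of how the returned mask is typically consumed by a caller (assumed code for illustration; in this tree the real dispatch happens in scrypt_getROMix further down):

	size_t flags = detect_cpu();
	if (flags & cpu_avx2)      { /* AVX2 path */ }
	else if (flags & cpu_xop)  { /* XOP path */ }
	else if (flags & cpu_sse2) { /* SSE2 path */ }
	else                       { /* portable fallback */ }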
310
algo/argon2/ar2/sj/scrypt-jane-portable.h
Normal file
@@ -0,0 +1,310 @@
/* determine os */
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
	#include <windows.h>
	#include <wincrypt.h>
	#define OS_WINDOWS
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
	#include <sys/mman.h>
	#include <sys/time.h>
	#include <fcntl.h>

	#define OS_SOLARIS
#else
	#include <sys/mman.h>
	#include <sys/time.h>
	#include <sys/param.h> /* need this to define BSD */
	#include <unistd.h>
	#include <fcntl.h>

	#define OS_NIX
	#if defined(__linux__)
		#include <endian.h>
		#define OS_LINUX
	#elif defined(BSD)
		#define OS_BSD

		#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
			#define OS_OSX
		#elif defined(macintosh) || defined(Macintosh)
			#define OS_MAC
		#elif defined(__OpenBSD__)
			#define OS_OPENBSD
		#endif
	#endif
#endif


/* determine compiler */
#if defined(_MSC_VER)
	#define COMPILER_MSVC_VS6       120000000
	#define COMPILER_MSVC_VS6PP     121000000
	#define COMPILER_MSVC_VS2002    130000000
	#define COMPILER_MSVC_VS2003    131000000
	#define COMPILER_MSVC_VS2005    140050727
	#define COMPILER_MSVC_VS2008    150000000
	#define COMPILER_MSVC_VS2008SP1 150030729
	#define COMPILER_MSVC_VS2010    160000000
	#define COMPILER_MSVC_VS2010SP1 160040219
	#define COMPILER_MSVC_VS2012RC  170000000
	#define COMPILER_MSVC_VS2012    170050727

	#if _MSC_FULL_VER > 100000000
		#define COMPILER_MSVC (_MSC_FULL_VER)
	#else
		#define COMPILER_MSVC (_MSC_FULL_VER * 10)
	#endif

	#if ((_MSC_VER == 1200) && defined(_mm_free))
		#undef COMPILER_MSVC
		#define COMPILER_MSVC COMPILER_MSVC_VS6PP
	#endif

	#pragma warning(disable : 4127) /* conditional expression is constant */
	#pragma warning(disable : 4100) /* unreferenced formal parameter */

	#ifndef _CRT_SECURE_NO_WARNINGS
		#define _CRT_SECURE_NO_WARNINGS
	#endif

	#include <float.h>
	#include <stdlib.h> /* _rotl */
	#include <intrin.h>

	typedef unsigned char uint8_t;
	typedef unsigned short uint16_t;
	typedef unsigned int uint32_t;
	typedef signed int int32_t;
	typedef unsigned __int64 uint64_t;
	typedef signed __int64 int64_t;

	#define ROTL32(a,b) _rotl(a,b)
	#define ROTR32(a,b) _rotr(a,b)
	#define ROTL64(a,b) _rotl64(a,b)
	#define ROTR64(a,b) _rotr64(a,b)
	#undef NOINLINE
	#define NOINLINE __declspec(noinline)
	#undef NORETURN
	#define NORETURN
	#undef INLINE
	#define INLINE __forceinline
	#undef FASTCALL
	#define FASTCALL __fastcall
	#undef CDECL
	#define CDECL __cdecl
	#undef STDCALL
	#define STDCALL __stdcall
	#undef NAKED
	#define NAKED __declspec(naked)
	#define ALIGN(n) __declspec(align(n))
#endif
#if defined(__ICC)
	#define COMPILER_INTEL
#endif
#if defined(__GNUC__)
	#if (__GNUC__ >= 3)
		#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
	#else
		#define COMPILER_GCC_PATCHLEVEL 0
	#endif
	#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
	#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
	#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
	#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
	#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b)))
	#undef NOINLINE
	#if (COMPILER_GCC >= 30000)
		#define NOINLINE __attribute__((noinline))
	#else
		#define NOINLINE
	#endif
	#undef NORETURN
	#if (COMPILER_GCC >= 30000)
		#define NORETURN __attribute__((noreturn))
	#else
		#define NORETURN
	#endif
	#undef INLINE
	#if (COMPILER_GCC >= 30000)
		#define INLINE __attribute__((always_inline))
	#else
		#define INLINE inline
	#endif
	#undef FASTCALL
	#if (COMPILER_GCC >= 30400)
		#define FASTCALL __attribute__((fastcall))
	#else
		#define FASTCALL
	#endif
	#undef CDECL
	#define CDECL __attribute__((cdecl))
	#undef STDCALL
	#define STDCALL __attribute__((stdcall))
	#define ALIGN(n) __attribute__((aligned(n)))
	#include <stdint.h>
#endif
#if defined(__MINGW32__) || defined(__MINGW64__)
	#define COMPILER_MINGW
#endif
#if defined(__PATHCC__)
	#define COMPILER_PATHCC
#endif

#define OPTIONAL_INLINE
#if defined(OPTIONAL_INLINE)
	#undef OPTIONAL_INLINE
	#define OPTIONAL_INLINE INLINE
#else
	#define OPTIONAL_INLINE
#endif

#define CRYPTO_FN NOINLINE STDCALL

/* determine cpu */
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
	#define CPU_X86_64
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
	#define CPU_X86 500
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
	#define CPU_X86 400
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
	#define CPU_X86 300
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
	#define CPU_IA64
#endif

#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
	#define CPU_SPARC
	#if defined(__sparcv9)
		#define CPU_SPARC64
	#endif
#endif

#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
	#define CPU_64BITS
	#undef FASTCALL
	#define FASTCALL
	#undef CDECL
	#define CDECL
	#undef STDCALL
	#define STDCALL
#endif

#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
	#define CPU_PPC
	#if defined(_ARCH_PWR7)
		#define CPU_POWER7
	#elif defined(__64BIT__)
		#define CPU_PPC64
	#else
		#define CPU_PPC32
	#endif
#endif

#if defined(__hppa__) || defined(__hppa)
	#define CPU_HPPA
#endif

#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
	#define CPU_ALPHA
#endif

/* endian */

#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
	(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
	(defined(CPU_X86) || defined(CPU_X86_64)) || \
	(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
#define CPU_LE
#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
	(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
	(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
#define CPU_BE
#else
	/* unknown endian! */
#endif


#define U8TO32_BE(p) \
	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
	 ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))

#define U8TO32_LE(p) \
	(((uint32_t)((p)[0])      ) | ((uint32_t)((p)[1]) <<  8) | \
	 ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))

#define U32TO8_BE(p, v) \
	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );

#define U32TO8_LE(p, v) \
	(p)[0] = (uint8_t)((v)      ); (p)[1] = (uint8_t)((v) >>  8); \
	(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);

#define U8TO64_BE(p) \
	(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))

#define U8TO64_LE(p) \
	(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))

#define U64TO8_BE(p, v) \
	U32TO8_BE((p),     (uint32_t)((v) >> 32)); \
	U32TO8_BE((p) + 4, (uint32_t)((v)      ));

#define U64TO8_LE(p, v) \
	U32TO8_LE((p),     (uint32_t)((v)      )); \
	U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));

#define U32_SWAP(v) { \
	(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \
	(v) = ((v) << 16) | ((v) >> 16); \
}

#define U64_SWAP(v) { \
	(v) = (((v) <<  8) & 0xFF00FF00FF00FF00ull ) | (((v) >>  8) & 0x00FF00FF00FF00FFull ); \
	(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \
	(v) = ((v) << 32) | ((v) >> 32); \
}

static int
scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
	uint32_t differentbits = 0;
	while (len--)
		differentbits |= (*x++ ^ *y++);
	return (1 & ((differentbits - 1) >> 8));
}

static void
scrypt_ensure_zero(void *p, size_t len) {
#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
	__stosb((unsigned char *)p, 0, len);
#elif (defined(CPU_X86) && defined(COMPILER_GCC))
	__asm__ __volatile__(
		"pushl %%edi;\n"
		"pushl %%ecx;\n"
		"rep stosb;\n"
		"popl %%ecx;\n"
		"popl %%edi;\n"
		:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
	);
#elif (defined(CPU_X86_64) && defined(COMPILER_GCC))
	__asm__ __volatile__(
		"pushq %%rdi;\n"
		"pushq %%rcx;\n"
		"rep stosb;\n"
		"popq %%rcx;\n"
		"popq %%rdi;\n"
		:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
	);
#else
	volatile uint8_t *b = (volatile uint8_t *)p;
	size_t i;
	for (i = 0; i < len; i++)
		b[i] = 0;
#endif
}

#include "scrypt-jane-portable-x86.h"

#if !defined(asm_calling_convention)
#define asm_calling_convention
#endif
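Note scrypt_verify's return convention: it ORs together the differing bits without branching (constant time in the buffer contents) and returns 1 on a match, 0 otherwise. A self-check sketch (hypothetical, not part of the header):

	#include <assert.h>

	static void
	scrypt_verify_selfcheck(void) {
		static const uint8_t a[4] = {1, 2, 3, 4};
		static const uint8_t b[4] = {1, 2, 3, 4};
		static const uint8_t c[4] = {1, 2, 3, 5};
		assert(scrypt_verify(a, b, 4) == 1); /* equal buffers -> 1 */
		assert(scrypt_verify(a, c, 4) == 0); /* any difference -> 0 */
	}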
74
algo/argon2/ar2/sj/scrypt-jane-romix-basic.h
Normal file
@@ -0,0 +1,74 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
/* function type returned by scrypt_getROMix, used with cpu detection */
typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
#endif

/* romix pre/post nop function */
static void asm_calling_convention
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
	(void)blocks; (void)nblocks;
}

/* romix pre/post endian conversion function */
static void asm_calling_convention
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
#if !defined(CPU_LE)
	static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
	size_t i;
	if (endian_test.w == 0x100) {
		nblocks *= SCRYPT_BLOCK_WORDS;
		for (i = 0; i < nblocks; i++) {
			SCRYPT_WORD_ENDIAN_SWAP(blocks[i]);
		}
	}
#else
	(void)blocks; (void)nblocks;
#endif
}

/* chunkmix test function */
typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);

static int
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
	/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
	const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
	scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
#else
	scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
#endif
	uint8_t final[16];
	size_t i;

	for (i = 0; i < words; i++) {
		v = (scrypt_mix_word_t)i;
		v = (v << 8) | v;
		v = (v << 16) | v;
		chunk[0][i] = v;
	}

	prefn(chunk[0], blocks);
	mixfn(chunk[1], chunk[0], NULL, r);
	postfn(chunk[1], blocks);

	/* grab the last 16 bytes of the final block */
	for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) {
		SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
	}

	return scrypt_verify(expected, final, 16);
}

/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */
static scrypt_mix_word_t *
scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
	return base + (i * len);
}

/* returns a pointer to block i */
static scrypt_mix_word_t *
scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
	return base + (i * SCRYPT_BLOCK_WORDS);
}
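The two accessors are plain pointer arithmetic over the flat V arena; a usage illustration (assumed values: 64-bit mix words, so SCRYPT_BLOCK_WORDS == 16):

	static void
	scrypt_addressing_example(scrypt_mix_word_t *V) {
		scrypt_mix_word_t *fourth_block = scrypt_block(V, 3);                        /* V + 3 * 16 */
		scrypt_mix_word_t *chunk_j      = scrypt_item(V, 7, 2 * SCRYPT_BLOCK_WORDS); /* V + 7 * 32 */
		(void)fourth_block; (void)chunk_j;
	}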
122
algo/argon2/ar2/sj/scrypt-jane-romix-template.h
Normal file
@@ -0,0 +1,122 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)

#if defined(SCRYPT_CHOOSE_COMPILETIME)
#undef SCRYPT_ROMIX_FN
#define SCRYPT_ROMIX_FN scrypt_ROMix
#endif

#undef SCRYPT_HAVE_ROMIX
#define SCRYPT_HAVE_ROMIX

#if !defined(SCRYPT_CHUNKMIX_FN)

#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic

/*
	Bout = ChunkMix(Bin)

	2*r: number of blocks in the chunk
*/
static void asm_calling_convention
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
	scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block;
#else
	scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block;
#endif
	uint32_t i, j, blocksPerChunk = /*r * 2*/2, half = 0;

	/* 1: X = B_{2r - 1} */
	block = scrypt_block(Bin, blocksPerChunk - 1);
	for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
		X[i] = block[i];

	if (Bxor) {
		block = scrypt_block(Bxor, blocksPerChunk - 1);
		for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
			X[i] ^= block[i];
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= /*r*/1) {
		/* 3: X = H(X ^ B_i) */
		block = scrypt_block(Bin, i);
		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
			X[j] ^= block[j];

		if (Bxor) {
			block = scrypt_block(Bxor, i);
			for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
				X[j] ^= block[j];
		}
		SCRYPT_MIX_FN(X);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		block = scrypt_block(Bout, (i / 2) + half);
		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
			block[j] = X[j];
	}
}
#endif

/*
	X = ROMix(X)

	X: chunk to mix
	Y: scratch chunk
	N: number of rounds
	V[N]: array of chunks to randomly index in to
	2*r: number of blocks in a chunk
*/

static void NOINLINE FASTCALL
SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
	uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * 2);
	scrypt_mix_word_t *block = V;

	SCRYPT_ROMIX_TANGLE_FN(X, 2);

	/* 1: X = B */
	/* implicit */

	/* 2: for i = 0 to N - 1 do */
	memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
	for (i = 0; i < /*N - 1*/511; i++, block += chunkWords) {
		/* 3: V_i = X */
		/* 4: X = H(X) */
		SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, /*r*/1);
	}
	SCRYPT_CHUNKMIX_FN(X, block, NULL, 1);

	/* 6: for i = 0 to N - 1 do */
	for (i = 0; i < /*N*/512; i += 2) {
		/* 7: j = Integerify(X) % N */
		j = X[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;

		/* 8: X = H(Y ^ V_j) */
		SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), 1);

		/* 7: j = Integerify(Y) % N */
		j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;

		/* 8: X = H(Y ^ V_j) */
		SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), 1);
	}

	/* 10: B' = X */
	/* implicit */

	SCRYPT_ROMIX_UNTANGLE_FN(X, 2);
}

#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */


#undef SCRYPT_CHUNKMIX_FN
#undef SCRYPT_ROMIX_FN
#undef SCRYPT_MIX_FN
#undef SCRYPT_ROMIX_TANGLE_FN
#undef SCRYPT_ROMIX_UNTANGLE_FN
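Note that this copy of the template hard-codes N = 512 and r = 1 (the generic expressions survive only in the comments), so V must hold N * chunkWords = 512 * 32 sixty-four-bit words. A sizing sketch (hypothetical allocation, malloc from <stdlib.h>):

	/* 512 chunks * 32 uint64_t * 8 bytes = 128 KiB */
	uint64_t *V = (uint64_t *)malloc(512 * 32 * sizeof(uint64_t));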
23
algo/argon2/ar2/sj/scrypt-jane-romix.h
Normal file
@@ -0,0 +1,23 @@
#if defined(SCRYPT_SALSA64)
#include "scrypt-jane-salsa64.h"
#else
	#define SCRYPT_MIX_BASE "ERROR"
	typedef uint32_t scrypt_mix_word_t;
	#define SCRYPT_WORDTO8_LE U32TO8_LE
	#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
	#define SCRYPT_BLOCK_BYTES 64
	#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
	#if !defined(SCRYPT_CHOOSE_COMPILETIME)
	static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
	static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; }
	#else
	static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
	#endif
	static int scrypt_test_mix(void) { return 0; }
	#error must define a mix function!
#endif

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
#undef SCRYPT_MIX
#define SCRYPT_MIX SCRYPT_MIX_BASE
#endif
183
algo/argon2/ar2/sj/scrypt-jane-salsa64.h
Normal file
@@ -0,0 +1,183 @@
#define SCRYPT_MIX_BASE "Salsa64/8"

typedef uint64_t scrypt_mix_word_t;

#define SCRYPT_WORDTO8_LE U64TO8_LE
#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP

#define SCRYPT_BLOCK_BYTES 128
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))

/* must have these here in case block bytes is ever != 64 */
#include "scrypt-jane-romix-basic.h"

#include "scrypt-jane-mix_salsa64-avx2.h"
#include "scrypt-jane-mix_salsa64-xop.h"
#include "scrypt-jane-mix_salsa64-avx.h"
#include "scrypt-jane-mix_salsa64-ssse3.h"
#include "scrypt-jane-mix_salsa64-sse2.h"
#include "scrypt-jane-mix_salsa64.h"

#if defined(SCRYPT_SALSA64_AVX2)
	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2
	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx2
	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
	#include "scrypt-jane-romix-template.h"
#endif

#if defined(SCRYPT_SALSA64_XOP)
	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
	#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
	#include "scrypt-jane-romix-template.h"
#endif

#if defined(SCRYPT_SALSA64_AVX)
	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
	#include "scrypt-jane-romix-template.h"
#endif

#if defined(SCRYPT_SALSA64_SSSE3)
	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
	#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
	#include "scrypt-jane-romix-template.h"
#endif

#if defined(SCRYPT_SALSA64_SSE2)
	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
	#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
	#include "scrypt-jane-romix-template.h"
#endif

/* cpu agnostic */
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
#define SCRYPT_MIX_FN salsa64_core_basic
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
#include "scrypt-jane-romix-template.h"

#if !defined(SCRYPT_CHOOSE_COMPILETIME)
static scrypt_ROMixfn
scrypt_getROMix(void) {
	size_t cpuflags = detect_cpu();

#if defined(SCRYPT_SALSA64_AVX2)
	if (cpuflags & cpu_avx2)
		return scrypt_ROMix_avx2;
	else
#endif

#if defined(SCRYPT_SALSA64_XOP)
	if (cpuflags & cpu_xop)
		return scrypt_ROMix_xop;
	else
#endif

#if defined(SCRYPT_SALSA64_AVX)
	if (cpuflags & cpu_avx)
		return scrypt_ROMix_avx;
	else
#endif

#if defined(SCRYPT_SALSA64_SSSE3)
	if (cpuflags & cpu_ssse3)
		return scrypt_ROMix_ssse3;
	else
#endif

#if defined(SCRYPT_SALSA64_SSE2)
	if (cpuflags & cpu_sse2)
		return scrypt_ROMix_sse2;
	else
#endif

	return scrypt_ROMix_basic;
}
#endif


#if defined(SCRYPT_TEST_SPEED)
static size_t
available_implementations(void) {
	size_t cpuflags = detect_cpu();
	size_t flags = 0;

#if defined(SCRYPT_SALSA64_AVX2)
	if (cpuflags & cpu_avx2)
		flags |= cpu_avx2;
#endif

#if defined(SCRYPT_SALSA64_XOP)
	if (cpuflags & cpu_xop)
		flags |= cpu_xop;
#endif

#if defined(SCRYPT_SALSA64_AVX)
	if (cpuflags & cpu_avx)
		flags |= cpu_avx;
#endif

#if defined(SCRYPT_SALSA64_SSSE3)
	if (cpuflags & cpu_ssse3)
		flags |= cpu_ssse3;
#endif

#if defined(SCRYPT_SALSA64_SSE2)
	if (cpuflags & cpu_sse2)
		flags |= cpu_sse2;
#endif

	return flags;
}
#endif

static int
scrypt_test_mix(void) {
	static const uint8_t expected[16] = {
		0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
	};

	int ret = 1;
	size_t cpuflags = detect_cpu();

#if defined(SCRYPT_SALSA64_AVX2)
	if (cpuflags & cpu_avx2)
		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
#endif

#if defined(SCRYPT_SALSA64_XOP)
	if (cpuflags & cpu_xop)
		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
#endif

#if defined(SCRYPT_SALSA64_AVX)
	if (cpuflags & cpu_avx)
		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
#endif

#if defined(SCRYPT_SALSA64_SSSE3)
	if (cpuflags & cpu_ssse3)
		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
#endif

#if defined(SCRYPT_SALSA64_SSE2)
	if (cpuflags & cpu_sse2)
		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
#endif

#if defined(SCRYPT_SALSA64_BASIC)
	ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
#endif

	return ret;
}
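Runtime selection, as a caller would use it when SCRYPT_CHOOSE_COMPILETIME is not defined (a sketch; X, Y and V are assumed allocated as sized for the template above, and N/r must match the values baked into it):

	scrypt_ROMixfn romix = scrypt_getROMix();
	romix(X, Y, V, 512, 1);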
28
algo/argon2/ar2/sj/scrypt-jane-test-vectors.h
Normal file
@@ -0,0 +1,28 @@
typedef struct scrypt_test_setting_t {
	const char *pw, *salt;
	uint8_t Nfactor, rfactor, pfactor;
} scrypt_test_setting;

static const scrypt_test_setting post_settings[] = {
	{"", "", 3, 0, 0},
	{"password", "NaCl", 9, 3, 4},
	{0, 0, 0, 0, 0}
};

#if defined(SCRYPT_SKEIN512)
#if defined(SCRYPT_SALSA64)
static const uint8_t post_vectors[][64] = {
	{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
	 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
	 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
	 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
	{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
	 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
	 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
	 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
};
#endif
#else
static const uint8_t post_vectors[][64] = {{0}};
#endif