mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Initial upload v3.4.7
This commit is contained in:
0
algo/scryptjane/.dirstamp
Normal file
0
algo/scryptjane/.dirstamp
Normal file
28
algo/scryptjane/scrypt-conf.h
Normal file
28
algo/scryptjane/scrypt-conf.h
Normal file
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
pick the best algo at runtime or compile time?
|
||||
----------------------------------------------
|
||||
SCRYPT_CHOOSE_COMPILETIME (gcc only!)
|
||||
SCRYPT_CHOOSE_RUNTIME
|
||||
*/
|
||||
#define SCRYPT_CHOOSE_RUNTIME
|
||||
|
||||
|
||||
/*
|
||||
hash function to use
|
||||
-------------------------------
|
||||
SCRYPT_BLAKE256
|
||||
SCRYPT_BLAKE512
|
||||
SCRYPT_SHA256
|
||||
SCRYPT_SHA512
|
||||
SCRYPT_SKEIN512
|
||||
*/
|
||||
//#define SCRYPT_SHA256
|
||||
|
||||
|
||||
/*
|
||||
block mixer to use
|
||||
-----------------------------
|
||||
SCRYPT_CHACHA
|
||||
SCRYPT_SALSA
|
||||
*/
|
||||
//#define SCRYPT_SALSA
|
||||
148
algo/scryptjane/scrypt-jane-chacha.h
Normal file
148
algo/scryptjane/scrypt-jane-chacha.h
Normal file
@@ -0,0 +1,148 @@
|
||||
#define SCRYPT_MIX_BASE "ChaCha20/8"
|
||||
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_chacha-avx.h"
|
||||
#include "scrypt-jane-mix_chacha-ssse3.h"
|
||||
#include "scrypt-jane-mix_chacha-sse2.h"
|
||||
#include "scrypt-jane-mix_chacha.h"
|
||||
|
||||
#if defined(SCRYPT_CHACHA_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#if defined(X86_INTRINSIC_AVX)
|
||||
#define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_avx_1
|
||||
#define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_avx_1_xor
|
||||
#endif
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_MIX_FN chacha_core_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_ssse3_1
|
||||
#define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_ssse3_1_xor
|
||||
#endif
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
|
||||
#define SCRYPT_MIX_FN chacha_core_ssse3
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_sse2_1
|
||||
#define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_sse2_1_xor
|
||||
#endif
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_MIX_FN chacha_core_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN chacha_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_CHACHA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
return scrypt_ROMix_ssse3;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static size_t
|
||||
available_implementations() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
size_t flags = 0;
|
||||
|
||||
#if defined(SCRYPT_CHACHA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
flags |= cpu_avx;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
flags |= cpu_ssse3;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
flags |= cpu_sse2;
|
||||
#endif
|
||||
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
scrypt_test_mix() {
|
||||
static const uint8_t expected[16] = {
|
||||
0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a,
|
||||
};
|
||||
|
||||
int ret = 1;
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_CHACHA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, scrypt_romix_nop, scrypt_romix_nop, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, scrypt_romix_nop, scrypt_romix_nop, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_BASIC)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
48
algo/scryptjane/scrypt-jane-hash.h
Normal file
48
algo/scryptjane/scrypt-jane-hash.h
Normal file
@@ -0,0 +1,48 @@
|
||||
#if defined(SCRYPT_BLAKE512)
|
||||
#include "scrypt-jane-hash_blake512.h"
|
||||
#elif defined(SCRYPT_BLAKE256)
|
||||
#include "scrypt-jane-hash_blake256.h"
|
||||
#elif defined(SCRYPT_SHA512)
|
||||
#include "scrypt-jane-hash_sha512.h"
|
||||
#elif defined(SCRYPT_SHA256)
|
||||
#include "scrypt-jane-hash_sha256.h"
|
||||
#elif defined(SCRYPT_SKEIN512)
|
||||
#include "scrypt-jane-hash_skein512.h"
|
||||
#elif defined(SCRYPT_KECCAK512) || defined(SCRYPT_KECCAK256)
|
||||
#include "scrypt-jane-hash_keccak.h"
|
||||
#else
|
||||
#define SCRYPT_HASH "ERROR"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 64
|
||||
typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state;
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
static void scrypt_hash_init(scrypt_hash_state *S) {}
|
||||
static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {}
|
||||
static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {}
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0};
|
||||
#error must define a hash function!
|
||||
#endif
|
||||
|
||||
#include "scrypt-jane-pbkdf2.h"
|
||||
|
||||
#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
|
||||
|
||||
static int
|
||||
scrypt_test_hash() {
|
||||
scrypt_hash_state st;
|
||||
scrypt_hash_digest hash, final;
|
||||
uint8_t msg[SCRYPT_TEST_HASH_LEN];
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++)
|
||||
msg[i] = (uint8_t)i;
|
||||
|
||||
scrypt_hash_init(&st);
|
||||
for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) {
|
||||
scrypt_hash(hash, msg, i);
|
||||
scrypt_hash_update(&st, hash, sizeof(hash));
|
||||
}
|
||||
scrypt_hash_finish(&st, final);
|
||||
return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
|
||||
}
|
||||
|
||||
177
algo/scryptjane/scrypt-jane-hash_blake256.h
Normal file
177
algo/scryptjane/scrypt-jane-hash_blake256.h
Normal file
@@ -0,0 +1,177 @@
|
||||
#define SCRYPT_HASH "BLAKE-256"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
const uint8_t blake256_sigma[] = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
|
||||
14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3,
|
||||
11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4,
|
||||
7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8,
|
||||
9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13,
|
||||
2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9,
|
||||
12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11,
|
||||
13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10,
|
||||
6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5,
|
||||
10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0,
|
||||
};
|
||||
|
||||
const uint32_t blake256_constants[16] = {
|
||||
0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
|
||||
0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
|
||||
};
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint32_t H[8], T[2];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static void
|
||||
blake256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
|
||||
const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16);
|
||||
uint32_t m[16], v[16], h[8], t[2];
|
||||
uint32_t i;
|
||||
|
||||
for (i = 0; i < 8; i++) h[i] = S->H[i];
|
||||
for (i = 0; i < 2; i++) t[i] = S->T[i];
|
||||
|
||||
while (blocks--) {
|
||||
t[0] += 512;
|
||||
t[1] += (t[0] < 512) ? 1 : 0;
|
||||
|
||||
for (i = 0; i < 8; i++) v[i ] = h[i];
|
||||
for (i = 0; i < 4; i++) v[i + 8] = blake256_constants[i];
|
||||
for (i = 0; i < 2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0];
|
||||
for (i = 0; i < 2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1];
|
||||
|
||||
for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]);
|
||||
in += 64;
|
||||
|
||||
#define G(a,b,c,d,e) \
|
||||
v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \
|
||||
v[d] = ROTR32(v[d] ^ v[a],16); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR32(v[b] ^ v[c],12); \
|
||||
v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \
|
||||
v[d] = ROTR32(v[d] ^ v[a], 8); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR32(v[b] ^ v[c], 7);
|
||||
|
||||
for (i = 0, sigma = blake256_sigma; i < 14; i++) {
|
||||
G(0, 4, 8,12, 0);
|
||||
G(1, 5, 9,13, 2);
|
||||
G(2, 6,10,14, 4);
|
||||
G(3, 7,11,15, 6);
|
||||
|
||||
G(0, 5,10,15, 8);
|
||||
G(1, 6,11,12,10);
|
||||
G(2, 7, 8,13,12);
|
||||
G(3, 4, 9,14,14);
|
||||
|
||||
sigma += 16;
|
||||
if (sigma == sigma_end)
|
||||
sigma = blake256_sigma;
|
||||
}
|
||||
|
||||
#undef G
|
||||
|
||||
for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; i++) S->H[i] = h[i];
|
||||
for (i = 0; i < 2; i++) S->T[i] = t[i];
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->H[0] = 0x6a09e667ULL;
|
||||
S->H[1] = 0xbb67ae85ULL;
|
||||
S->H[2] = 0x3c6ef372ULL;
|
||||
S->H[3] = 0xa54ff53aULL;
|
||||
S->H[4] = 0x510e527fULL;
|
||||
S->H[5] = 0x9b05688cULL;
|
||||
S->H[6] = 0x1f83d9abULL;
|
||||
S->H[7] = 0x5be0cd19ULL;
|
||||
S->T[0] = 0;
|
||||
S->T[1] = 0;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
blake256_blocks(S, S->buffer, 1);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
S->leftover = (uint32_t)(inlen - blocks);
|
||||
if (blocks) {
|
||||
blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
|
||||
in += blocks;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
uint32_t th, tl, bits;
|
||||
|
||||
bits = (S->leftover << 3);
|
||||
tl = S->T[0] + bits;
|
||||
th = S->T[1];
|
||||
if (S->leftover == 0) {
|
||||
S->T[0] = (uint32_t)0 - (uint32_t)512;
|
||||
S->T[1] = (uint32_t)0 - (uint32_t)1;
|
||||
} else if (S->T[0] == 0) {
|
||||
S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits;
|
||||
S->T[1] = S->T[1] - 1;
|
||||
} else {
|
||||
S->T[0] -= (512 - bits);
|
||||
}
|
||||
|
||||
S->buffer[S->leftover] = 0x80;
|
||||
if (S->leftover <= 55) {
|
||||
memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
|
||||
} else {
|
||||
memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
|
||||
blake256_blocks(S, S->buffer, 1);
|
||||
S->T[0] = (uint32_t)0 - (uint32_t)512;
|
||||
S->T[1] = (uint32_t)0 - (uint32_t)1;
|
||||
memset(S->buffer, 0, 56);
|
||||
}
|
||||
S->buffer[55] |= 1;
|
||||
U32TO8_BE(S->buffer + 56, th);
|
||||
U32TO8_BE(S->buffer + 60, tl);
|
||||
blake256_blocks(S, S->buffer, 1);
|
||||
|
||||
U32TO8_BE(&hash[ 0], S->H[0]);
|
||||
U32TO8_BE(&hash[ 4], S->H[1]);
|
||||
U32TO8_BE(&hash[ 8], S->H[2]);
|
||||
U32TO8_BE(&hash[12], S->H[3]);
|
||||
U32TO8_BE(&hash[16], S->H[4]);
|
||||
U32TO8_BE(&hash[20], S->H[5]);
|
||||
U32TO8_BE(&hash[24], S->H[6]);
|
||||
U32TO8_BE(&hash[28], S->H[7]);
|
||||
}
|
||||
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20,
|
||||
0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8,
|
||||
};
|
||||
181
algo/scryptjane/scrypt-jane-hash_blake512.h
Normal file
181
algo/scryptjane/scrypt-jane-hash_blake512.h
Normal file
@@ -0,0 +1,181 @@
|
||||
#define SCRYPT_HASH "BLAKE-512"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 128
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 64
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
const uint8_t blake512_sigma[] = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
|
||||
14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3,
|
||||
11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4,
|
||||
7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8,
|
||||
9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13,
|
||||
2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9,
|
||||
12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11,
|
||||
13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10,
|
||||
6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5,
|
||||
10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0,
|
||||
};
|
||||
|
||||
const uint64_t blake512_constants[16] = {
|
||||
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
|
||||
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
|
||||
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
|
||||
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
|
||||
};
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint64_t H[8], T[2];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static void
|
||||
blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
|
||||
const uint8_t *sigma, *sigma_end = blake512_sigma + (10 * 16);
|
||||
uint64_t m[16], v[16], h[8], t[2];
|
||||
uint32_t i;
|
||||
|
||||
for (i = 0; i < 8; i++) h[i] = S->H[i];
|
||||
for (i = 0; i < 2; i++) t[i] = S->T[i];
|
||||
|
||||
while (blocks--) {
|
||||
t[0] += 1024;
|
||||
t[1] += (t[0] < 1024) ? 1 : 0;
|
||||
|
||||
for (i = 0; i < 8; i++) v[i ] = h[i];
|
||||
for (i = 0; i < 4; i++) v[i + 8] = blake512_constants[i];
|
||||
for (i = 0; i < 2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0];
|
||||
for (i = 0; i < 2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1];
|
||||
|
||||
for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]);
|
||||
in += 128;
|
||||
|
||||
#define G(a,b,c,d,e) \
|
||||
v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \
|
||||
v[d] = ROTR64(v[d] ^ v[a],32); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR64(v[b] ^ v[c],25); \
|
||||
v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \
|
||||
v[d] = ROTR64(v[d] ^ v[a],16); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR64(v[b] ^ v[c],11);
|
||||
|
||||
for (i = 0, sigma = blake512_sigma; i < 16; i++) {
|
||||
G(0, 4, 8,12, 0);
|
||||
G(1, 5, 9,13, 2);
|
||||
G(2, 6,10,14, 4);
|
||||
G(3, 7,11,15, 6);
|
||||
G(0, 5,10,15, 8);
|
||||
G(1, 6,11,12,10);
|
||||
G(2, 7, 8,13,12);
|
||||
G(3, 4, 9,14,14);
|
||||
|
||||
sigma += 16;
|
||||
if (sigma == sigma_end)
|
||||
sigma = blake512_sigma;
|
||||
}
|
||||
|
||||
#undef G
|
||||
|
||||
for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; i++) S->H[i] = h[i];
|
||||
for (i = 0; i < 2; i++) S->T[i] = t[i];
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->H[0] = 0x6a09e667f3bcc908ULL;
|
||||
S->H[1] = 0xbb67ae8584caa73bULL;
|
||||
S->H[2] = 0x3c6ef372fe94f82bULL;
|
||||
S->H[3] = 0xa54ff53a5f1d36f1ULL;
|
||||
S->H[4] = 0x510e527fade682d1ULL;
|
||||
S->H[5] = 0x9b05688c2b3e6c1fULL;
|
||||
S->H[6] = 0x1f83d9abfb41bd6bULL;
|
||||
S->H[7] = 0x5be0cd19137e2179ULL;
|
||||
S->T[0] = 0;
|
||||
S->T[1] = 0;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
blake512_blocks(S, S->buffer, 1);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
S->leftover = (uint32_t)(inlen - blocks);
|
||||
if (blocks) {
|
||||
blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
|
||||
in += blocks;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
uint64_t th, tl;
|
||||
size_t bits;
|
||||
|
||||
bits = (S->leftover << 3);
|
||||
tl = S->T[0] + bits;
|
||||
th = S->T[1];
|
||||
if (S->leftover == 0) {
|
||||
S->T[0] = (uint64_t)0 - (uint64_t)1024;
|
||||
S->T[1] = (uint64_t)0 - (uint64_t)1;
|
||||
} else if (S->T[0] == 0) {
|
||||
S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits;
|
||||
S->T[1] = S->T[1] - 1;
|
||||
} else {
|
||||
S->T[0] -= (1024 - bits);
|
||||
}
|
||||
|
||||
S->buffer[S->leftover] = 0x80;
|
||||
if (S->leftover <= 111) {
|
||||
memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover);
|
||||
} else {
|
||||
memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover);
|
||||
blake512_blocks(S, S->buffer, 1);
|
||||
S->T[0] = (uint64_t)0 - (uint64_t)1024;
|
||||
S->T[1] = (uint64_t)0 - (uint64_t)1;
|
||||
memset(S->buffer, 0, 112);
|
||||
}
|
||||
S->buffer[111] |= 1;
|
||||
U64TO8_BE(S->buffer + 112, th);
|
||||
U64TO8_BE(S->buffer + 120, tl);
|
||||
blake512_blocks(S, S->buffer, 1);
|
||||
|
||||
U64TO8_BE(&hash[ 0], S->H[0]);
|
||||
U64TO8_BE(&hash[ 8], S->H[1]);
|
||||
U64TO8_BE(&hash[16], S->H[2]);
|
||||
U64TO8_BE(&hash[24], S->H[3]);
|
||||
U64TO8_BE(&hash[32], S->H[4]);
|
||||
U64TO8_BE(&hash[40], S->H[5]);
|
||||
U64TO8_BE(&hash[48], S->H[6]);
|
||||
U64TO8_BE(&hash[56], S->H[7]);
|
||||
}
|
||||
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f,
|
||||
0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83,
|
||||
0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a,
|
||||
0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac
|
||||
};
|
||||
168
algo/scryptjane/scrypt-jane-hash_keccak.h
Normal file
168
algo/scryptjane/scrypt-jane-hash_keccak.h
Normal file
@@ -0,0 +1,168 @@
|
||||
#if defined(SCRYPT_KECCAK256)
|
||||
#define SCRYPT_HASH "Keccak-256"
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32
|
||||
#else
|
||||
#define SCRYPT_HASH "Keccak-512"
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 64
|
||||
#endif
|
||||
#define SCRYPT_KECCAK_F 1600
|
||||
#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 256=512, 512=1024 */
|
||||
#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 256=1088, 512=576 */
|
||||
#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8)
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint64_t state[SCRYPT_KECCAK_F / 64];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static const uint64_t keccak_round_constants[24] = {
|
||||
0x0000000000000001ull, 0x0000000000008082ull,
|
||||
0x800000000000808aull, 0x8000000080008000ull,
|
||||
0x000000000000808bull, 0x0000000080000001ull,
|
||||
0x8000000080008081ull, 0x8000000000008009ull,
|
||||
0x000000000000008aull, 0x0000000000000088ull,
|
||||
0x0000000080008009ull, 0x000000008000000aull,
|
||||
0x000000008000808bull, 0x800000000000008bull,
|
||||
0x8000000000008089ull, 0x8000000000008003ull,
|
||||
0x8000000000008002ull, 0x8000000000000080ull,
|
||||
0x000000000000800aull, 0x800000008000000aull,
|
||||
0x8000000080008081ull, 0x8000000000008080ull,
|
||||
0x0000000080000001ull, 0x8000000080008008ull
|
||||
};
|
||||
|
||||
static void
|
||||
keccak_block(scrypt_hash_state *S, const uint8_t *in) {
|
||||
size_t i;
|
||||
uint64_t *s = S->state, t[5], u[5], v, w;
|
||||
|
||||
/* absorb input */
|
||||
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8)
|
||||
s[i] ^= U8TO64_LE(in);
|
||||
|
||||
for (i = 0; i < 24; i++) {
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
|
||||
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
|
||||
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
|
||||
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
|
||||
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
u[0] = t[4] ^ ROTL64(t[1], 1);
|
||||
u[1] = t[0] ^ ROTL64(t[2], 1);
|
||||
u[2] = t[1] ^ ROTL64(t[3], 1);
|
||||
u[3] = t[2] ^ ROTL64(t[4], 1);
|
||||
u[4] = t[3] ^ ROTL64(t[0], 1);
|
||||
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
|
||||
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
|
||||
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
|
||||
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
|
||||
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
v = s[ 1];
|
||||
s[ 1] = ROTL64(s[ 6], 44);
|
||||
s[ 6] = ROTL64(s[ 9], 20);
|
||||
s[ 9] = ROTL64(s[22], 61);
|
||||
s[22] = ROTL64(s[14], 39);
|
||||
s[14] = ROTL64(s[20], 18);
|
||||
s[20] = ROTL64(s[ 2], 62);
|
||||
s[ 2] = ROTL64(s[12], 43);
|
||||
s[12] = ROTL64(s[13], 25);
|
||||
s[13] = ROTL64(s[19], 8);
|
||||
s[19] = ROTL64(s[23], 56);
|
||||
s[23] = ROTL64(s[15], 41);
|
||||
s[15] = ROTL64(s[ 4], 27);
|
||||
s[ 4] = ROTL64(s[24], 14);
|
||||
s[24] = ROTL64(s[21], 2);
|
||||
s[21] = ROTL64(s[ 8], 55);
|
||||
s[ 8] = ROTL64(s[16], 45);
|
||||
s[16] = ROTL64(s[ 5], 36);
|
||||
s[ 5] = ROTL64(s[ 3], 28);
|
||||
s[ 3] = ROTL64(s[18], 21);
|
||||
s[18] = ROTL64(s[17], 15);
|
||||
s[17] = ROTL64(s[11], 10);
|
||||
s[11] = ROTL64(s[ 7], 6);
|
||||
s[ 7] = ROTL64(s[10], 3);
|
||||
s[10] = ROTL64( v, 1);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
|
||||
v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
|
||||
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
|
||||
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
|
||||
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= keccak_round_constants[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
memset(S, 0, sizeof(*S));
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
keccak_block(S, S->buffer);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
while (inlen >= SCRYPT_HASH_BLOCK_SIZE) {
|
||||
keccak_block(S, in);
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
inlen -= SCRYPT_HASH_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
S->leftover = (uint32_t)inlen;
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
size_t i;
|
||||
|
||||
S->buffer[S->leftover] = 0x01;
|
||||
memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1));
|
||||
S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80;
|
||||
keccak_block(S, S->buffer);
|
||||
|
||||
for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) {
|
||||
U64TO8_LE(&hash[i], S->state[i / 8]);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(SCRYPT_KECCAK256)
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0x26,0xb7,0x10,0xb3,0x66,0xb1,0xd1,0xb1,0x25,0xfc,0x3e,0xe3,0x1e,0x33,0x1d,0x19,
|
||||
0x94,0xaa,0x63,0x7a,0xd5,0x77,0x29,0xb4,0x27,0xe9,0xe0,0xf4,0x19,0xba,0x68,0xea,
|
||||
};
|
||||
#else
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0x17,0xc7,0x8c,0xa0,0xd9,0x08,0x1d,0xba,0x8a,0xc8,0x3e,0x07,0x90,0xda,0x91,0x88,
|
||||
0x25,0xbd,0xd3,0xf8,0x78,0x4a,0x8d,0x5e,0xe4,0x96,0x9c,0x01,0xf3,0xeb,0xdc,0x12,
|
||||
0xea,0x35,0x57,0xba,0x94,0xb8,0xe9,0xb9,0x27,0x45,0x0a,0x48,0x5c,0x3d,0x69,0xf0,
|
||||
0xdb,0x22,0x38,0xb5,0x52,0x22,0x29,0xea,0x7a,0xb2,0xe6,0x07,0xaa,0x37,0x4d,0xe6,
|
||||
};
|
||||
#endif
|
||||
|
||||
135
algo/scryptjane/scrypt-jane-hash_sha256.h
Normal file
135
algo/scryptjane/scrypt-jane-hash_sha256.h
Normal file
@@ -0,0 +1,135 @@
|
||||
#define SCRYPT_HASH "SHA-2-256"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint32_t H[8];
|
||||
uint64_t T;
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static const uint32_t sha256_constants[64] = {
|
||||
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
|
||||
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
|
||||
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
|
||||
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
|
||||
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
|
||||
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
|
||||
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
|
||||
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
|
||||
};
|
||||
|
||||
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
|
||||
#define Maj(x,y,z) (((x | y) & z) | (x & y))
|
||||
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
|
||||
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
|
||||
#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3))
|
||||
#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10))
|
||||
#define W0(in,i) (U8TO32_BE(&in[i * 4]))
|
||||
#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])
|
||||
#define STEP(i) \
|
||||
t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
|
||||
t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \
|
||||
r[7] = r[6]; \
|
||||
r[6] = r[5]; \
|
||||
r[5] = r[4]; \
|
||||
r[4] = r[3] + t0; \
|
||||
r[3] = r[2]; \
|
||||
r[2] = r[1]; \
|
||||
r[1] = r[0]; \
|
||||
r[0] = t0 + t1;
|
||||
|
||||
static void
|
||||
sha256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
|
||||
uint32_t r[8], w[64], t0, t1;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < 8; i++) r[i] = S->H[i];
|
||||
|
||||
while (blocks--) {
|
||||
for (i = 0; i < 16; i++) { w[i] = W0(in, i); }
|
||||
for (i = 16; i < 64; i++) { w[i] = W1(i); }
|
||||
for (i = 0; i < 64; i++) { STEP(i); }
|
||||
for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; }
|
||||
S->T += SCRYPT_HASH_BLOCK_SIZE * 8;
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->H[0] = 0x6a09e667;
|
||||
S->H[1] = 0xbb67ae85;
|
||||
S->H[2] = 0x3c6ef372;
|
||||
S->H[3] = 0xa54ff53a;
|
||||
S->H[4] = 0x510e527f;
|
||||
S->H[5] = 0x9b05688c;
|
||||
S->H[6] = 0x1f83d9ab;
|
||||
S->H[7] = 0x5be0cd19;
|
||||
S->T = 0;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
sha256_blocks(S, S->buffer, 1);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
S->leftover = (uint32_t)(inlen - blocks);
|
||||
if (blocks) {
|
||||
sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
|
||||
in += blocks;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
uint64_t t = S->T + (S->leftover * 8);
|
||||
|
||||
S->buffer[S->leftover] = 0x80;
|
||||
if (S->leftover <= 55) {
|
||||
memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
|
||||
} else {
|
||||
memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
|
||||
sha256_blocks(S, S->buffer, 1);
|
||||
memset(S->buffer, 0, 56);
|
||||
}
|
||||
|
||||
U64TO8_BE(S->buffer + 56, t);
|
||||
sha256_blocks(S, S->buffer, 1);
|
||||
|
||||
U32TO8_BE(&hash[ 0], S->H[0]);
|
||||
U32TO8_BE(&hash[ 4], S->H[1]);
|
||||
U32TO8_BE(&hash[ 8], S->H[2]);
|
||||
U32TO8_BE(&hash[12], S->H[3]);
|
||||
U32TO8_BE(&hash[16], S->H[4]);
|
||||
U32TO8_BE(&hash[20], S->H[5]);
|
||||
U32TO8_BE(&hash[24], S->H[6]);
|
||||
U32TO8_BE(&hash[28], S->H[7]);
|
||||
}
|
||||
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0xee,0x36,0xae,0xa6,0x65,0xf0,0x28,0x7d,0xc9,0xde,0xd8,0xad,0x48,0x33,0x7d,0xbf,
|
||||
0xcb,0xc0,0x48,0xfa,0x5f,0x92,0xfd,0x0a,0x95,0x6f,0x34,0x8e,0x8c,0x1e,0x73,0xad,
|
||||
};
|
||||
152
algo/scryptjane/scrypt-jane-hash_sha512.h
Normal file
152
algo/scryptjane/scrypt-jane-hash_sha512.h
Normal file
@@ -0,0 +1,152 @@
|
||||
#define SCRYPT_HASH "SHA-2-512"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 128
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 64
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint64_t H[8];
|
||||
uint64_t T[2];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static const uint64_t sha512_constants[80] = {
|
||||
0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull,
|
||||
0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull,
|
||||
0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull,
|
||||
0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull,
|
||||
0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull,
|
||||
0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull,
|
||||
0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull,
|
||||
0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull,
|
||||
0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 0x53380d139d95b3dfull,
|
||||
0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull,
|
||||
0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull,
|
||||
0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull,
|
||||
0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull,
|
||||
0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull,
|
||||
0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull,
|
||||
0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull,
|
||||
0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull,
|
||||
0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull,
|
||||
0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull,
|
||||
0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull
|
||||
};
|
||||
|
||||
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
|
||||
#define Maj(x,y,z) (((x | y) & z) | (x & y))
|
||||
#define S0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
|
||||
#define S1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
|
||||
#define G0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7))
|
||||
#define G1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6))
|
||||
#define W0(in,i) (U8TO64_BE(&in[i * 8]))
|
||||
#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])
|
||||
#define STEP(i) \
|
||||
t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
|
||||
t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[i] + w[i]; \
|
||||
r[7] = r[6]; \
|
||||
r[6] = r[5]; \
|
||||
r[5] = r[4]; \
|
||||
r[4] = r[3] + t0; \
|
||||
r[3] = r[2]; \
|
||||
r[2] = r[1]; \
|
||||
r[1] = r[0]; \
|
||||
r[0] = t0 + t1;
|
||||
|
||||
static void
|
||||
sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
|
||||
uint64_t r[8], w[80], t0, t1;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < 8; i++) r[i] = S->H[i];
|
||||
|
||||
while (blocks--) {
|
||||
for (i = 0; i < 16; i++) { w[i] = W0(in, i); }
|
||||
for (i = 16; i < 80; i++) { w[i] = W1(i); }
|
||||
for (i = 0; i < 80; i++) { STEP(i); }
|
||||
for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; }
|
||||
S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8;
|
||||
S->T[1] += (!S->T[0]) ? 1 : 0;
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->H[0] = 0x6a09e667f3bcc908ull;
|
||||
S->H[1] = 0xbb67ae8584caa73bull;
|
||||
S->H[2] = 0x3c6ef372fe94f82bull;
|
||||
S->H[3] = 0xa54ff53a5f1d36f1ull;
|
||||
S->H[4] = 0x510e527fade682d1ull;
|
||||
S->H[5] = 0x9b05688c2b3e6c1full;
|
||||
S->H[6] = 0x1f83d9abfb41bd6bull;
|
||||
S->H[7] = 0x5be0cd19137e2179ull;
|
||||
S->T[0] = 0;
|
||||
S->T[1] = 0;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
sha512_blocks(S, S->buffer, 1);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
S->leftover = (uint32_t)(inlen - blocks);
|
||||
if (blocks) {
|
||||
sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
|
||||
in += blocks;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1];
|
||||
|
||||
S->buffer[S->leftover] = 0x80;
|
||||
if (S->leftover <= 111) {
|
||||
memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover);
|
||||
} else {
|
||||
memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover);
|
||||
sha512_blocks(S, S->buffer, 1);
|
||||
memset(S->buffer, 0, 112);
|
||||
}
|
||||
|
||||
U64TO8_BE(S->buffer + 112, t1);
|
||||
U64TO8_BE(S->buffer + 120, t0);
|
||||
sha512_blocks(S, S->buffer, 1);
|
||||
|
||||
U64TO8_BE(&hash[ 0], S->H[0]);
|
||||
U64TO8_BE(&hash[ 8], S->H[1]);
|
||||
U64TO8_BE(&hash[16], S->H[2]);
|
||||
U64TO8_BE(&hash[24], S->H[3]);
|
||||
U64TO8_BE(&hash[32], S->H[4]);
|
||||
U64TO8_BE(&hash[40], S->H[5]);
|
||||
U64TO8_BE(&hash[48], S->H[6]);
|
||||
U64TO8_BE(&hash[56], S->H[7]);
|
||||
}
|
||||
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0xba,0xc3,0x80,0x2b,0x24,0x56,0x95,0x1f,0x19,0x7c,0xa2,0xd3,0x72,0x7c,0x9a,0x4d,
|
||||
0x1d,0x50,0x3a,0xa9,0x12,0x27,0xd8,0xe1,0xbe,0x76,0x53,0x87,0x5a,0x1e,0x82,0xec,
|
||||
0xc8,0xe1,0x6b,0x87,0xd0,0xb5,0x25,0x7e,0xe8,0x1e,0xd7,0x58,0xc6,0x2d,0xc2,0x9c,
|
||||
0x06,0x31,0x8f,0x5b,0x57,0x8e,0x76,0xba,0xd5,0xf6,0xec,0xfe,0x85,0x1f,0x34,0x0c,
|
||||
};
|
||||
188
algo/scryptjane/scrypt-jane-hash_skein512.h
Normal file
188
algo/scryptjane/scrypt-jane-hash_skein512.h
Normal file
@@ -0,0 +1,188 @@
|
||||
#define SCRYPT_HASH "Skein-512"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 64
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint64_t X[8], T[2];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
static void
|
||||
skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
|
||||
uint64_t X[8], key[8], Xt[9+18], T[3+1];
|
||||
size_t r;
|
||||
|
||||
while (blocks--) {
|
||||
T[0] = S->T[0] + add;
|
||||
T[1] = S->T[1];
|
||||
T[2] = T[0] ^ T[1];
|
||||
key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
|
||||
key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
|
||||
key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
|
||||
key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
|
||||
key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
|
||||
key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
|
||||
key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
|
||||
key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
|
||||
Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
|
||||
for (r = 0; r < 18; r++)
|
||||
Xt[r + 9] = Xt[r + 0];
|
||||
|
||||
for (r = 0; r < 18; r += 2) {
|
||||
X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
|
||||
X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
|
||||
X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
|
||||
X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
|
||||
X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
|
||||
X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
|
||||
X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
|
||||
X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
|
||||
X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
|
||||
X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
|
||||
X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
|
||||
X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
|
||||
X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
|
||||
X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
|
||||
X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
|
||||
X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0];
|
||||
|
||||
X[0] += Xt[r + 1];
|
||||
X[1] += Xt[r + 2];
|
||||
X[2] += Xt[r + 3];
|
||||
X[3] += Xt[r + 4];
|
||||
X[4] += Xt[r + 5];
|
||||
X[5] += Xt[r + 6] + T[1];
|
||||
X[6] += Xt[r + 7] + T[2];
|
||||
X[7] += Xt[r + 8] + r + 1;
|
||||
|
||||
T[3] = T[0];
|
||||
T[0] = T[1];
|
||||
T[1] = T[2];
|
||||
T[2] = T[3];
|
||||
|
||||
X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
|
||||
X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
|
||||
X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
|
||||
X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
|
||||
X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
|
||||
X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
|
||||
X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
|
||||
X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
|
||||
X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
|
||||
X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
|
||||
X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
|
||||
X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
|
||||
X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6];
|
||||
X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
|
||||
X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
|
||||
X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];
|
||||
|
||||
X[0] += Xt[r + 2];
|
||||
X[1] += Xt[r + 3];
|
||||
X[2] += Xt[r + 4];
|
||||
X[3] += Xt[r + 5];
|
||||
X[4] += Xt[r + 6];
|
||||
X[5] += Xt[r + 7] + T[1];
|
||||
X[6] += Xt[r + 8] + T[2];
|
||||
X[7] += Xt[r + 9] + r + 2;
|
||||
|
||||
T[3] = T[0];
|
||||
T[0] = T[1];
|
||||
T[1] = T[2];
|
||||
T[2] = T[3];
|
||||
}
|
||||
|
||||
S->X[0] = key[0] ^ X[0];
|
||||
S->X[1] = key[1] ^ X[1];
|
||||
S->X[2] = key[2] ^ X[2];
|
||||
S->X[3] = key[3] ^ X[3];
|
||||
S->X[4] = key[4] ^ X[4];
|
||||
S->X[5] = key[5] ^ X[5];
|
||||
S->X[6] = key[6] ^ X[6];
|
||||
S->X[7] = key[7] ^ X[7];
|
||||
|
||||
S->T[0] = T[0];
|
||||
S->T[1] = T[1] & ~0x4000000000000000ull;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->X[0] = 0x4903ADFF749C51CEull;
|
||||
S->X[1] = 0x0D95DE399746DF03ull;
|
||||
S->X[2] = 0x8FD1934127C79BCEull;
|
||||
S->X[3] = 0x9A255629FF352CB1ull;
|
||||
S->X[4] = 0x5DB62599DF6CA7B0ull;
|
||||
S->X[5] = 0xEABE394CA9D5C3F4ull;
|
||||
S->X[6] = 0x991112C71A75B523ull;
|
||||
S->X[7] = 0xAE18A40B660FCC33ull;
|
||||
S->T[0] = 0x0000000000000000ull;
|
||||
S->T[1] = 0x7000000000000000ull;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
|
||||
if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
|
||||
/* handle the previous data, we know there is enough for at least one block */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
in += want;
|
||||
inlen -= want;
|
||||
S->leftover = 0;
|
||||
skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/* handle the current data if there's more than one block */
|
||||
if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
|
||||
blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
|
||||
inlen -= blocks;
|
||||
in += blocks;
|
||||
}
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
memcpy(S->buffer + S->leftover, in, inlen);
|
||||
S->leftover += inlen;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
S->T[1] |= 0x8000000000000000ull;
|
||||
skein512_blocks(S, S->buffer, 1, S->leftover);
|
||||
|
||||
memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
|
||||
S->T[0] = 0;
|
||||
S->T[1] = 0xff00000000000000ull;
|
||||
skein512_blocks(S, S->buffer, 1, 8);
|
||||
|
||||
U64TO8_LE(&hash[ 0], S->X[0]);
|
||||
U64TO8_LE(&hash[ 8], S->X[1]);
|
||||
U64TO8_LE(&hash[16], S->X[2]);
|
||||
U64TO8_LE(&hash[24], S->X[3]);
|
||||
U64TO8_LE(&hash[32], S->X[4]);
|
||||
U64TO8_LE(&hash[40], S->X[5]);
|
||||
U64TO8_LE(&hash[48], S->X[6]);
|
||||
U64TO8_LE(&hash[56], S->X[7]);
|
||||
}
|
||||
|
||||
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
|
||||
0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
|
||||
0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
|
||||
0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
|
||||
};
|
||||
564
algo/scryptjane/scrypt-jane-mix_chacha-avx.h
Normal file
564
algo/scryptjane/scrypt-jane-mix_chacha-avx.h
Normal file
@@ -0,0 +1,564 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a1(push ebx)
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
a2(mov edi,[ebp+20])
|
||||
a2(mov esi,[ebp+24])
|
||||
a2(mov eax,[ebp+28])
|
||||
a2(mov ebx,[ebp+32])
|
||||
a2(sub esp,64)
|
||||
a2(and esp,~63)
|
||||
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6)
|
||||
a2(lea ecx,[edx-64])
|
||||
a2(and eax, eax)
|
||||
a2(mov ebx, 0x01000302)
|
||||
a2(vmovd xmm4, ebx)
|
||||
a2(mov ebx, 0x05040706)
|
||||
a2(vmovd xmm0, ebx)
|
||||
a2(mov ebx, 0x09080b0a)
|
||||
a2(vmovd xmm1, ebx)
|
||||
a2(mov ebx, 0x0d0c0f0e)
|
||||
a2(vmovd xmm2, ebx)
|
||||
a2(mov ebx, 0x02010003)
|
||||
a2(vmovd xmm5, ebx)
|
||||
a2(mov ebx, 0x06050407)
|
||||
a2(vmovd xmm3, ebx)
|
||||
a2(mov ebx, 0x0a09080b)
|
||||
a2(vmovd xmm6, ebx)
|
||||
a2(mov ebx, 0x0e0d0c0f)
|
||||
a2(vmovd xmm7, ebx)
|
||||
a3(vpunpckldq xmm4, xmm4, xmm0)
|
||||
a3(vpunpckldq xmm5, xmm5, xmm3)
|
||||
a3(vpunpckldq xmm1, xmm1, xmm2)
|
||||
a3(vpunpckldq xmm6, xmm6, xmm7)
|
||||
a3(vpunpcklqdq xmm4, xmm4, xmm1)
|
||||
a3(vpunpcklqdq xmm5, xmm5, xmm6)
|
||||
a2(vmovdqa xmm0,[ecx+esi+0])
|
||||
a2(vmovdqa xmm1,[ecx+esi+16])
|
||||
a2(vmovdqa xmm2,[ecx+esi+32])
|
||||
a2(vmovdqa xmm3,[ecx+esi+48])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[ecx+eax+0])
|
||||
a3(vpxor xmm1,xmm1,[ecx+eax+16])
|
||||
a3(vpxor xmm2,xmm2,[ecx+eax+32])
|
||||
a3(vpxor xmm3,xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor ecx,ecx)
|
||||
a2(xor ebx,ebx)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and eax, eax)
|
||||
a3(vpxor xmm0,xmm0,[esi+ecx+0])
|
||||
a3(vpxor xmm1,xmm1,[esi+ecx+16])
|
||||
a3(vpxor xmm2,xmm2,[esi+ecx+32])
|
||||
a3(vpxor xmm3,xmm3,[esi+ecx+48])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[eax+ecx+0])
|
||||
a3(vpxor xmm1,xmm1,[eax+ecx+16])
|
||||
a3(vpxor xmm2,xmm2,[eax+ecx+32])
|
||||
a3(vpxor xmm3,xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa [esp+0],xmm0)
|
||||
a2(vmovdqa [esp+16],xmm1)
|
||||
a2(vmovdqa [esp+32],xmm2)
|
||||
a2(vmovdqa [esp+48],xmm3)
|
||||
a2(mov eax,8)
|
||||
a1(scrypt_chacha_avx_loop: )
|
||||
a3(vpaddd xmm0,xmm0,xmm1)
|
||||
a3(vpxor xmm3,xmm3,xmm0)
|
||||
a3(vpshufb xmm3,xmm3,xmm4)
|
||||
a3(vpaddd xmm2,xmm2,xmm3)
|
||||
a3(vpxor xmm1,xmm1,xmm2)
|
||||
a3(vpsrld xmm6,xmm1,20)
|
||||
a3(vpslld xmm1,xmm1,12)
|
||||
a3(vpxor xmm1,xmm1,xmm6)
|
||||
a3(vpaddd xmm0,xmm0,xmm1)
|
||||
a3(vpxor xmm3,xmm3,xmm0)
|
||||
a3(vpshufb xmm3,xmm3,xmm5)
|
||||
a3(vpshufd xmm0,xmm0,0x93)
|
||||
a3(vpaddd xmm2,xmm2,xmm3)
|
||||
a3(vpshufd xmm3,xmm3,0x4e)
|
||||
a3(vpxor xmm1,xmm1,xmm2)
|
||||
a3(vpshufd xmm2,xmm2,0x39)
|
||||
a3(vpsrld xmm6,xmm1,25)
|
||||
a3(vpslld xmm1,xmm1,7)
|
||||
a3(vpxor xmm1,xmm1,xmm6)
|
||||
a2(sub eax,2)
|
||||
a3(vpaddd xmm0,xmm0,xmm1)
|
||||
a3(vpxor xmm3,xmm3,xmm0)
|
||||
a3(vpshufb xmm3,xmm3,xmm4)
|
||||
a3(vpaddd xmm2,xmm2,xmm3)
|
||||
a3(vpxor xmm1,xmm1,xmm2)
|
||||
a3(vpsrld xmm6,xmm1,20)
|
||||
a3(vpslld xmm1,xmm1,12)
|
||||
a3(vpxor xmm1,xmm1,xmm6)
|
||||
a3(vpaddd xmm0,xmm0,xmm1)
|
||||
a3(vpxor xmm3,xmm3,xmm0)
|
||||
a3(vpshufb xmm3,xmm3,xmm5)
|
||||
a3(vpshufd xmm0,xmm0,0x39)
|
||||
a3(vpaddd xmm2,xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a3(vpxor xmm1,xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a3(vpsrld xmm6,xmm1,25)
|
||||
a3(vpslld xmm1,xmm1,7)
|
||||
a3(vpxor xmm1,xmm1,xmm6)
|
||||
a1(ja scrypt_chacha_avx_loop)
|
||||
a3(vpaddd xmm0,xmm0,[esp+0])
|
||||
a3(vpaddd xmm1,xmm1,[esp+16])
|
||||
a3(vpaddd xmm2,xmm2,[esp+32])
|
||||
a3(vpaddd xmm3,xmm3,[esp+48])
|
||||
a2(lea eax,[ebx+ecx])
|
||||
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
a2(cmp ecx,edx)
|
||||
a2(vmovdqa [eax+0],xmm0)
|
||||
a2(vmovdqa [eax+16],xmm1)
|
||||
a2(vmovdqa [eax+32],xmm2)
|
||||
a2(vmovdqa [eax+48],xmm3)
|
||||
a2(mov eax,[ebp+28])
|
||||
a1(jne scrypt_ChunkMix_avx_loop)
|
||||
a2(mov esp,ebp)
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
aret(16)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,6)
|
||||
a2(lea r9,[rcx-64])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
a2(mov r8, 0x0504070601000302)
|
||||
a2(mov rax, 0x0d0c0f0e09080b0a)
|
||||
a2(movq xmm4, r8)
|
||||
a2(movq xmm6, rax)
|
||||
a2(mov r8, 0x0605040702010003)
|
||||
a2(mov rax, 0x0e0d0c0f0a09080b)
|
||||
a2(movq xmm5, r8)
|
||||
a2(movq xmm7, rax)
|
||||
a3(vpunpcklqdq xmm4, xmm4, xmm6)
|
||||
a3(vpunpcklqdq xmm5, xmm5, xmm7)
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor r8,r8)
|
||||
a2(xor r9,r9)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa xmm8,xmm0)
|
||||
a2(vmovdqa xmm9,xmm1)
|
||||
a2(vmovdqa xmm10,xmm2)
|
||||
a2(vmovdqa xmm11,xmm3)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_chacha_avx_loop: )
|
||||
a3(vpaddd xmm0,xmm0,xmm1)
|
||||
a3(vpxor xmm3,xmm3,xmm0)
|
||||
a3(vpshufb xmm3,xmm3,xmm4)
|
||||
a3(vpaddd xmm2,xmm2,xmm3)
|
||||
a3(vpxor xmm1,xmm1,xmm2)
|
||||
a3(vpsrld xmm12,xmm1,20)
|
||||
a3(vpslld xmm1,xmm1,12)
|
||||
a3(vpxor xmm1,xmm1,xmm12)
|
||||
a3(vpaddd xmm0,xmm0,xmm1)
|
||||
a3(vpxor xmm3,xmm3,xmm0)
|
||||
a3(vpshufb xmm3,xmm3,xmm5)
|
||||
a3(vpshufd xmm0,xmm0,0x93)
|
||||
a3(vpaddd xmm2,xmm2,xmm3)
|
||||
a3(vpshufd xmm3,xmm3,0x4e)
|
||||
a3(vpxor xmm1,xmm1,xmm2)
|
||||
a3(vpshufd xmm2,xmm2,0x39)
|
||||
a3(vpsrld xmm12,xmm1,25)
|
||||
a3(vpslld xmm1,xmm1,7)
|
||||
a3(vpxor xmm1,xmm1,xmm12)
|
||||
a2(sub rax,2)
|
||||
a3(vpaddd xmm0,xmm0,xmm1)
|
||||
a3(vpxor xmm3,xmm3,xmm0)
|
||||
a3(vpshufb xmm3,xmm3,xmm4)
|
||||
a3(vpaddd xmm2,xmm2,xmm3)
|
||||
a3(vpxor xmm1,xmm1,xmm2)
|
||||
a3(vpsrld xmm12,xmm1,20)
|
||||
a3(vpslld xmm1,xmm1,12)
|
||||
a3(vpxor xmm1,xmm1,xmm12)
|
||||
a3(vpaddd xmm0,xmm0,xmm1)
|
||||
a3(vpxor xmm3,xmm3,xmm0)
|
||||
a3(vpshufb xmm3,xmm3,xmm5)
|
||||
a3(vpshufd xmm0,xmm0,0x39)
|
||||
a3(vpaddd xmm2,xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a3(vpxor xmm1,xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a3(vpsrld xmm12,xmm1,25)
|
||||
a3(vpslld xmm1,xmm1,7)
|
||||
a3(vpxor xmm1,xmm1,xmm12)
|
||||
a1(ja scrypt_chacha_avx_loop)
|
||||
a3(vpaddd xmm0,xmm0,xmm8)
|
||||
a3(vpaddd xmm1,xmm1,xmm9)
|
||||
a3(vpaddd xmm2,xmm2,xmm10)
|
||||
a3(vpaddd xmm3,xmm3,xmm11)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0x7f)
|
||||
a2(add r9,64)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
a1(jne scrypt_ChunkMix_avx_loop)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_AVX
|
||||
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and no XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and unconditional XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_AVX)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "ChaCha/8-AVX"
|
||||
#undef SCRYPT_CHACHA_INCLUDED
|
||||
#define SCRYPT_CHACHA_INCLUDED
|
||||
#endif
|
||||
585
algo/scryptjane/scrypt-jane-mix_chacha-sse2.h
Normal file
585
algo/scryptjane/scrypt-jane-mix_chacha-sse2.h
Normal file
@@ -0,0 +1,585 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSE2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a1(push ebx)
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
a2(mov edi,[ebp+20])
|
||||
a2(mov esi,[ebp+24])
|
||||
a2(mov eax,[ebp+28])
|
||||
a2(mov ebx,[ebp+32])
|
||||
a2(sub esp,16)
|
||||
a2(and esp,~15)
|
||||
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6)
|
||||
a2(lea ecx,[edx-64])
|
||||
a2(and eax, eax)
|
||||
a2(movdqa xmm0,[ecx+esi+0])
|
||||
a2(movdqa xmm1,[ecx+esi+16])
|
||||
a2(movdqa xmm2,[ecx+esi+32])
|
||||
a2(movdqa xmm3,[ecx+esi+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[ecx+eax+0])
|
||||
a2(pxor xmm1,[ecx+eax+16])
|
||||
a2(pxor xmm2,[ecx+eax+32])
|
||||
a2(pxor xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor ecx,ecx)
|
||||
a2(xor ebx,ebx)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and eax, eax)
|
||||
a2(pxor xmm0,[esi+ecx+0])
|
||||
a2(pxor xmm1,[esi+ecx+16])
|
||||
a2(pxor xmm2,[esi+ecx+32])
|
||||
a2(pxor xmm3,[esi+ecx+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[eax+ecx+0])
|
||||
a2(pxor xmm1,[eax+ecx+16])
|
||||
a2(pxor xmm2,[eax+ecx+32])
|
||||
a2(pxor xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa [esp+0],xmm0)
|
||||
a2(movdqa xmm4,xmm1)
|
||||
a2(movdqa xmm5,xmm2)
|
||||
a2(movdqa xmm7,xmm3)
|
||||
a2(mov eax,8)
|
||||
a1(scrypt_chacha_sse2_loop: )
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,16)
|
||||
a2(psrld xmm6,16)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,8)
|
||||
a2(psrld xmm6,24)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a3(pshufd xmm0,xmm0,0x93)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x39)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(sub eax,2)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,16)
|
||||
a2(psrld xmm6,16)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,8)
|
||||
a2(psrld xmm6,24)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a3(pshufd xmm0,xmm0,0x39)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a1(ja scrypt_chacha_sse2_loop)
|
||||
a2(paddd xmm0,[esp+0])
|
||||
a2(paddd xmm1,xmm4)
|
||||
a2(paddd xmm2,xmm5)
|
||||
a2(paddd xmm3,xmm7)
|
||||
a2(lea eax,[ebx+ecx])
|
||||
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
a2(cmp ecx,edx)
|
||||
a2(movdqa [eax+0],xmm0)
|
||||
a2(movdqa [eax+16],xmm1)
|
||||
a2(movdqa [eax+32],xmm2)
|
||||
a2(movdqa [eax+48],xmm3)
|
||||
a2(mov eax,[ebp+28])
|
||||
a1(jne scrypt_ChunkMix_sse2_loop)
|
||||
a2(mov esp,ebp)
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
aret(16)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSE2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,6)
|
||||
a2(lea r9,[rcx-64])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa xmm8,xmm0)
|
||||
a2(movdqa xmm9,xmm1)
|
||||
a2(movdqa xmm10,xmm2)
|
||||
a2(movdqa xmm11,xmm3)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_chacha_sse2_loop: )
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,16)
|
||||
a2(psrld xmm6,16)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,8)
|
||||
a2(psrld xmm6,24)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a3(pshufd xmm0,xmm0,0x93)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x39)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(sub rax,2)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,16)
|
||||
a2(psrld xmm6,16)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,8)
|
||||
a2(psrld xmm6,24)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a3(pshufd xmm0,xmm0,0x39)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a1(ja scrypt_chacha_sse2_loop)
|
||||
a2(paddd xmm0,xmm8)
|
||||
a2(paddd xmm1,xmm9)
|
||||
a2(paddd xmm2,xmm10)
|
||||
a2(paddd xmm3,xmm11)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0x7f)
|
||||
a2(add r9,64)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a1(jne scrypt_ChunkMix_sse2_loop)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSE2
|
||||
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and no XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_sse2_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and unconditional XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_sse2_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "ChaCha/8-SSE2"
|
||||
#undef SCRYPT_CHACHA_INCLUDED
|
||||
#define SCRYPT_CHACHA_INCLUDED
|
||||
#endif
|
||||
572
algo/scryptjane/scrypt-jane-mix_chacha-ssse3.h
Normal file
572
algo/scryptjane/scrypt-jane-mix_chacha-ssse3.h
Normal file
@@ -0,0 +1,572 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSSE3
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_ssse3)
|
||||
a1(push ebx)
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
a2(mov edi,[ebp+20])
|
||||
a2(mov esi,[ebp+24])
|
||||
a2(mov eax,[ebp+28])
|
||||
a2(mov ebx,[ebp+32])
|
||||
a2(sub esp,64)
|
||||
a2(and esp,~63)
|
||||
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6)
|
||||
a2(lea ecx,[edx-64])
|
||||
a2(and eax, eax)
|
||||
a2(mov ebx, 0x01000302)
|
||||
a2(movd xmm4, ebx)
|
||||
a2(mov ebx, 0x05040706)
|
||||
a2(movd xmm0, ebx)
|
||||
a2(mov ebx, 0x09080b0a)
|
||||
a2(movd xmm1, ebx)
|
||||
a2(mov ebx, 0x0d0c0f0e)
|
||||
a2(movd xmm2, ebx)
|
||||
a2(mov ebx, 0x02010003)
|
||||
a2(movd xmm5, ebx)
|
||||
a2(mov ebx, 0x06050407)
|
||||
a2(movd xmm3, ebx)
|
||||
a2(mov ebx, 0x0a09080b)
|
||||
a2(movd xmm6, ebx)
|
||||
a2(mov ebx, 0x0e0d0c0f)
|
||||
a2(movd xmm7, ebx)
|
||||
a2(punpckldq xmm4, xmm0)
|
||||
a2(punpckldq xmm5, xmm3)
|
||||
a2(punpckldq xmm1, xmm2)
|
||||
a2(punpckldq xmm6, xmm7)
|
||||
a2(punpcklqdq xmm4, xmm1)
|
||||
a2(punpcklqdq xmm5, xmm6)
|
||||
a2(movdqa xmm0,[ecx+esi+0])
|
||||
a2(movdqa xmm1,[ecx+esi+16])
|
||||
a2(movdqa xmm2,[ecx+esi+32])
|
||||
a2(movdqa xmm3,[ecx+esi+48])
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor1)
|
||||
a2(pxor xmm0,[ecx+eax+0])
|
||||
a2(pxor xmm1,[ecx+eax+16])
|
||||
a2(pxor xmm2,[ecx+eax+32])
|
||||
a2(pxor xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor1:)
|
||||
a2(xor ecx,ecx)
|
||||
a2(xor ebx,ebx)
|
||||
a1(scrypt_ChunkMix_ssse3_loop:)
|
||||
a2(and eax, eax)
|
||||
a2(pxor xmm0,[esi+ecx+0])
|
||||
a2(pxor xmm1,[esi+ecx+16])
|
||||
a2(pxor xmm2,[esi+ecx+32])
|
||||
a2(pxor xmm3,[esi+ecx+48])
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor2)
|
||||
a2(pxor xmm0,[eax+ecx+0])
|
||||
a2(pxor xmm1,[eax+ecx+16])
|
||||
a2(pxor xmm2,[eax+ecx+32])
|
||||
a2(pxor xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor2:)
|
||||
a2(movdqa [esp+0],xmm0)
|
||||
a2(movdqa [esp+16],xmm1)
|
||||
a2(movdqa [esp+32],xmm2)
|
||||
a2(movdqa xmm7,xmm3)
|
||||
a2(mov eax,8)
|
||||
a1(scrypt_chacha_ssse3_loop: )
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm4)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm5)
|
||||
a3(pshufd xmm0,xmm0,0x93)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x39)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(sub eax,2)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm4)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm5)
|
||||
a3(pshufd xmm0,xmm0,0x39)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a1(ja scrypt_chacha_ssse3_loop)
|
||||
a2(paddd xmm0,[esp+0])
|
||||
a2(paddd xmm1,[esp+16])
|
||||
a2(paddd xmm2,[esp+32])
|
||||
a2(paddd xmm3,xmm7)
|
||||
a2(lea eax,[ebx+ecx])
|
||||
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
a2(cmp ecx,edx)
|
||||
a2(movdqa [eax+0],xmm0)
|
||||
a2(movdqa [eax+16],xmm1)
|
||||
a2(movdqa [eax+32],xmm2)
|
||||
a2(movdqa [eax+48],xmm3)
|
||||
a2(mov eax,[ebp+28])
|
||||
a1(jne scrypt_ChunkMix_ssse3_loop)
|
||||
a2(mov esp,ebp)
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
aret(16)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSSE3
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_ssse3)
|
||||
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,6)
|
||||
a2(lea r9,[rcx-64])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a2(mov r8, 0x0504070601000302)
|
||||
a2(mov rax, 0x0d0c0f0e09080b0a)
|
||||
a2(movq xmm4, r8)
|
||||
a2(movq xmm6, rax)
|
||||
a2(mov r8, 0x0605040702010003)
|
||||
a2(mov rax, 0x0e0d0c0f0a09080b)
|
||||
a2(movq xmm5, r8)
|
||||
a2(movq xmm7, rax)
|
||||
a2(punpcklqdq xmm4, xmm6)
|
||||
a2(punpcklqdq xmm5, xmm7)
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor1:)
|
||||
a2(xor r8,r8)
|
||||
a2(xor r9,r9)
|
||||
a1(scrypt_ChunkMix_ssse3_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor2:)
|
||||
a2(movdqa xmm8,xmm0)
|
||||
a2(movdqa xmm9,xmm1)
|
||||
a2(movdqa xmm10,xmm2)
|
||||
a2(movdqa xmm11,xmm3)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_chacha_ssse3_loop: )
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm4)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm12,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm12,20)
|
||||
a2(pxor xmm1,xmm12)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm5)
|
||||
a3(pshufd xmm0,xmm0,0x93)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x39)
|
||||
a2(movdqa xmm12,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm12,25)
|
||||
a2(pxor xmm1,xmm12)
|
||||
a2(sub rax,2)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm4)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm12,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm12,20)
|
||||
a2(pxor xmm1,xmm12)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm5)
|
||||
a3(pshufd xmm0,xmm0,0x39)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a2(movdqa xmm12,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm12,25)
|
||||
a2(pxor xmm1,xmm12)
|
||||
a1(ja scrypt_chacha_ssse3_loop)
|
||||
a2(paddd xmm0,xmm8)
|
||||
a2(paddd xmm1,xmm9)
|
||||
a2(paddd xmm2,xmm10)
|
||||
a2(paddd xmm3,xmm11)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0x7f)
|
||||
a2(add r9,64)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a1(jne scrypt_ChunkMix_ssse3_loop)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSSE3
|
||||
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and no XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_ssse3_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and unconditional XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_ssse3_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "ChaCha/8-SSSE3"
|
||||
#undef SCRYPT_CHACHA_INCLUDED
|
||||
#define SCRYPT_CHACHA_INCLUDED
|
||||
#endif
|
||||
69
algo/scryptjane/scrypt-jane-mix_chacha.h
Normal file
69
algo/scryptjane/scrypt-jane-mix_chacha.h
Normal file
@@ -0,0 +1,69 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "ChaCha20/8 Ref"
|
||||
|
||||
#undef SCRYPT_CHACHA_INCLUDED
|
||||
#define SCRYPT_CHACHA_INCLUDED
|
||||
#define SCRYPT_CHACHA_BASIC
|
||||
|
||||
static void
|
||||
chacha_core_basic(uint32_t state[16]) {
|
||||
size_t rounds = 8;
|
||||
uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t;
|
||||
|
||||
x0 = state[0];
|
||||
x1 = state[1];
|
||||
x2 = state[2];
|
||||
x3 = state[3];
|
||||
x4 = state[4];
|
||||
x5 = state[5];
|
||||
x6 = state[6];
|
||||
x7 = state[7];
|
||||
x8 = state[8];
|
||||
x9 = state[9];
|
||||
x10 = state[10];
|
||||
x11 = state[11];
|
||||
x12 = state[12];
|
||||
x13 = state[13];
|
||||
x14 = state[14];
|
||||
x15 = state[15];
|
||||
|
||||
#define quarter(a,b,c,d) \
|
||||
a += b; t = d^a; d = ROTL32(t,16); \
|
||||
c += d; t = b^c; b = ROTL32(t,12); \
|
||||
a += b; t = d^a; d = ROTL32(t, 8); \
|
||||
c += d; t = b^c; b = ROTL32(t, 7);
|
||||
|
||||
for (; rounds; rounds -= 2) {
|
||||
quarter( x0, x4, x8,x12)
|
||||
quarter( x1, x5, x9,x13)
|
||||
quarter( x2, x6,x10,x14)
|
||||
quarter( x3, x7,x11,x15)
|
||||
quarter( x0, x5,x10,x15)
|
||||
quarter( x1, x6,x11,x12)
|
||||
quarter( x2, x7, x8,x13)
|
||||
quarter( x3, x4, x9,x14)
|
||||
}
|
||||
|
||||
state[0] += x0;
|
||||
state[1] += x1;
|
||||
state[2] += x2;
|
||||
state[3] += x3;
|
||||
state[4] += x4;
|
||||
state[5] += x5;
|
||||
state[6] += x6;
|
||||
state[7] += x7;
|
||||
state[8] += x8;
|
||||
state[9] += x9;
|
||||
state[10] += x10;
|
||||
state[11] += x11;
|
||||
state[12] += x12;
|
||||
state[13] += x13;
|
||||
state[14] += x14;
|
||||
state[15] += x15;
|
||||
|
||||
#undef quarter
|
||||
}
|
||||
|
||||
#endif
|
||||
381
algo/scryptjane/scrypt-jane-mix_salsa-avx.h
Normal file
381
algo/scryptjane/scrypt-jane-mix_salsa-avx.h
Normal file
@@ -0,0 +1,381 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a1(push ebx)
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
a2(mov edi,[ebp+20])
|
||||
a2(mov esi,[ebp+24])
|
||||
a2(mov eax,[ebp+28])
|
||||
a2(mov ebx,[ebp+32])
|
||||
a2(sub esp,32)
|
||||
a2(and esp,~63)
|
||||
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6)
|
||||
a2(lea ecx,[edx-64])
|
||||
a2(and eax, eax)
|
||||
a2(movdqa xmm0,[ecx+esi+0])
|
||||
a2(movdqa xmm1,[ecx+esi+16])
|
||||
a2(movdqa xmm2,[ecx+esi+32])
|
||||
a2(movdqa xmm3,[ecx+esi+48])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[ecx+eax+0])
|
||||
a3(vpxor xmm1,xmm1,[ecx+eax+16])
|
||||
a3(vpxor xmm2,xmm2,[ecx+eax+32])
|
||||
a3(vpxor xmm3,xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor ecx,ecx)
|
||||
a2(xor ebx,ebx)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and eax, eax)
|
||||
a3(vpxor xmm0,xmm0,[esi+ecx+0])
|
||||
a3(vpxor xmm1,xmm1,[esi+ecx+16])
|
||||
a3(vpxor xmm2,xmm2,[esi+ecx+32])
|
||||
a3(vpxor xmm3,xmm3,[esi+ecx+48])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[eax+ecx+0])
|
||||
a3(vpxor xmm1,xmm1,[eax+ecx+16])
|
||||
a3(vpxor xmm2,xmm2,[eax+ecx+32])
|
||||
a3(vpxor xmm3,xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa [esp+0],xmm0)
|
||||
a2(vmovdqa [esp+16],xmm1)
|
||||
a2(vmovdqa xmm6,xmm2)
|
||||
a2(vmovdqa xmm7,xmm3)
|
||||
a2(mov eax,8)
|
||||
a1(scrypt_salsa_avx_loop: )
|
||||
a3(vpaddd xmm4, xmm1, xmm0)
|
||||
a3(vpsrld xmm5, xmm4, 25)
|
||||
a3(vpslld xmm4, xmm4, 7)
|
||||
a3(vpxor xmm3, xmm3, xmm5)
|
||||
a3(vpxor xmm3, xmm3, xmm4)
|
||||
a3(vpaddd xmm4, xmm0, xmm3)
|
||||
a3(vpsrld xmm5, xmm4, 23)
|
||||
a3(vpslld xmm4, xmm4, 9)
|
||||
a3(vpxor xmm2, xmm2, xmm5)
|
||||
a3(vpxor xmm2, xmm2, xmm4)
|
||||
a3(vpaddd xmm4, xmm3, xmm2)
|
||||
a3(vpsrld xmm5, xmm4, 19)
|
||||
a3(vpslld xmm4, xmm4, 13)
|
||||
a3(vpxor xmm1, xmm1, xmm5)
|
||||
a3(pshufd xmm3, xmm3, 0x93)
|
||||
a3(vpxor xmm1, xmm1, xmm4)
|
||||
a3(vpaddd xmm4, xmm2, xmm1)
|
||||
a3(vpsrld xmm5, xmm4, 14)
|
||||
a3(vpslld xmm4, xmm4, 18)
|
||||
a3(vpxor xmm0, xmm0, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a3(vpxor xmm0, xmm0, xmm4)
|
||||
a2(sub eax, 2)
|
||||
a3(vpaddd xmm4, xmm3, xmm0)
|
||||
a3(pshufd xmm1, xmm1, 0x39)
|
||||
a3(vpsrld xmm5, xmm4, 25)
|
||||
a3(vpslld xmm4, xmm4, 7)
|
||||
a3(vpxor xmm1, xmm1, xmm5)
|
||||
a3(vpxor xmm1, xmm1, xmm4)
|
||||
a3(vpaddd xmm4, xmm0, xmm1)
|
||||
a3(vpsrld xmm5, xmm4, 23)
|
||||
a3(vpslld xmm4, xmm4, 9)
|
||||
a3(vpxor xmm2, xmm2, xmm5)
|
||||
a3(vpxor xmm2, xmm2, xmm4)
|
||||
a3(vpaddd xmm4, xmm1, xmm2)
|
||||
a3(vpsrld xmm5, xmm4, 19)
|
||||
a3(vpslld xmm4, xmm4, 13)
|
||||
a3(vpxor xmm3, xmm3, xmm5)
|
||||
a3(pshufd xmm1, xmm1, 0x93)
|
||||
a3(vpxor xmm3, xmm3, xmm4)
|
||||
a3(vpaddd xmm4, xmm2, xmm3)
|
||||
a3(vpsrld xmm5, xmm4, 14)
|
||||
a3(vpslld xmm4, xmm4, 18)
|
||||
a3(vpxor xmm0, xmm0, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a3(vpxor xmm0, xmm0, xmm4)
|
||||
a3(pshufd xmm3, xmm3, 0x39)
|
||||
a1(ja scrypt_salsa_avx_loop)
|
||||
a3(vpaddd xmm0,xmm0,[esp+0])
|
||||
a3(vpaddd xmm1,xmm1,[esp+16])
|
||||
a3(vpaddd xmm2,xmm2,xmm6)
|
||||
a3(vpaddd xmm3,xmm3,xmm7)
|
||||
a2(lea eax,[ebx+ecx])
|
||||
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
a2(cmp ecx,edx)
|
||||
a2(vmovdqa [eax+0],xmm0)
|
||||
a2(vmovdqa [eax+16],xmm1)
|
||||
a2(vmovdqa [eax+32],xmm2)
|
||||
a2(vmovdqa [eax+48],xmm3)
|
||||
a2(mov eax,[ebp+28])
|
||||
a1(jne scrypt_ChunkMix_avx_loop)
|
||||
a2(mov esp,ebp)
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
aret(16)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,6)
|
||||
a2(lea r9,[rcx-64])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa xmm8,xmm0)
|
||||
a2(vmovdqa xmm9,xmm1)
|
||||
a2(vmovdqa xmm10,xmm2)
|
||||
a2(vmovdqa xmm11,xmm3)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa_avx_loop: )
|
||||
a3(vpaddd xmm4, xmm1, xmm0)
|
||||
a3(vpsrld xmm5, xmm4, 25)
|
||||
a3(vpslld xmm4, xmm4, 7)
|
||||
a3(vpxor xmm3, xmm3, xmm5)
|
||||
a3(vpxor xmm3, xmm3, xmm4)
|
||||
a3(vpaddd xmm4, xmm0, xmm3)
|
||||
a3(vpsrld xmm5, xmm4, 23)
|
||||
a3(vpslld xmm4, xmm4, 9)
|
||||
a3(vpxor xmm2, xmm2, xmm5)
|
||||
a3(vpxor xmm2, xmm2, xmm4)
|
||||
a3(vpaddd xmm4, xmm3, xmm2)
|
||||
a3(vpsrld xmm5, xmm4, 19)
|
||||
a3(vpslld xmm4, xmm4, 13)
|
||||
a3(vpxor xmm1, xmm1, xmm5)
|
||||
a3(pshufd xmm3, xmm3, 0x93)
|
||||
a3(vpxor xmm1, xmm1, xmm4)
|
||||
a3(vpaddd xmm4, xmm2, xmm1)
|
||||
a3(vpsrld xmm5, xmm4, 14)
|
||||
a3(vpslld xmm4, xmm4, 18)
|
||||
a3(vpxor xmm0, xmm0, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a3(vpxor xmm0, xmm0, xmm4)
|
||||
a2(sub rax, 2)
|
||||
a3(vpaddd xmm4, xmm3, xmm0)
|
||||
a3(pshufd xmm1, xmm1, 0x39)
|
||||
a3(vpsrld xmm5, xmm4, 25)
|
||||
a3(vpslld xmm4, xmm4, 7)
|
||||
a3(vpxor xmm1, xmm1, xmm5)
|
||||
a3(vpxor xmm1, xmm1, xmm4)
|
||||
a3(vpaddd xmm4, xmm0, xmm1)
|
||||
a3(vpsrld xmm5, xmm4, 23)
|
||||
a3(vpslld xmm4, xmm4, 9)
|
||||
a3(vpxor xmm2, xmm2, xmm5)
|
||||
a3(vpxor xmm2, xmm2, xmm4)
|
||||
a3(vpaddd xmm4, xmm1, xmm2)
|
||||
a3(vpsrld xmm5, xmm4, 19)
|
||||
a3(vpslld xmm4, xmm4, 13)
|
||||
a3(vpxor xmm3, xmm3, xmm5)
|
||||
a3(pshufd xmm1, xmm1, 0x93)
|
||||
a3(vpxor xmm3, xmm3, xmm4)
|
||||
a3(vpaddd xmm4, xmm2, xmm3)
|
||||
a3(vpsrld xmm5, xmm4, 14)
|
||||
a3(vpslld xmm4, xmm4, 18)
|
||||
a3(vpxor xmm0, xmm0, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a3(vpxor xmm0, xmm0, xmm4)
|
||||
a3(pshufd xmm3, xmm3, 0x39)
|
||||
a1(ja scrypt_salsa_avx_loop)
|
||||
a3(vpaddd xmm0,xmm0,xmm8)
|
||||
a3(vpaddd xmm1,xmm1,xmm9)
|
||||
a3(vpaddd xmm2,xmm2,xmm10)
|
||||
a3(vpaddd xmm3,xmm3,xmm11)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0x7f)
|
||||
a2(add r9,64)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
a1(jne scrypt_ChunkMix_avx_loop)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_AVX
|
||||
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x4 = x1;
|
||||
x4 = _mm_add_epi32(x4, x0);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 7);
|
||||
x5 = _mm_srli_epi32(x5, 25);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = x0;
|
||||
x3 = _mm_xor_si128(x3, x5);
|
||||
x4 = _mm_add_epi32(x4, x3);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 9);
|
||||
x5 = _mm_srli_epi32(x5, 23);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = x3;
|
||||
x2 = _mm_xor_si128(x2, x5);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x93);
|
||||
x4 = _mm_add_epi32(x4, x2);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 13);
|
||||
x5 = _mm_srli_epi32(x5, 19);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = x2;
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x4 = _mm_add_epi32(x4, x1);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 18);
|
||||
x5 = _mm_srli_epi32(x5, 14);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x4 = x3;
|
||||
x0 = _mm_xor_si128(x0, x5);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x39);
|
||||
x4 = _mm_add_epi32(x4, x0);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 7);
|
||||
x5 = _mm_srli_epi32(x5, 25);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = x0;
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x4 = _mm_add_epi32(x4, x1);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 9);
|
||||
x5 = _mm_srli_epi32(x5, 23);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = x1;
|
||||
x2 = _mm_xor_si128(x2, x5);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x93);
|
||||
x4 = _mm_add_epi32(x4, x2);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 13);
|
||||
x5 = _mm_srli_epi32(x5, 19);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = x2;
|
||||
x3 = _mm_xor_si128(x3, x5);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x4 = _mm_add_epi32(x4, x3);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 18);
|
||||
x5 = _mm_srli_epi32(x5, 14);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x39);
|
||||
x0 = _mm_xor_si128(x0, x5);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
/* uses salsa_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa/8-AVX"
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#endif
|
||||
443
algo/scryptjane/scrypt-jane-mix_salsa-sse2.h
Normal file
443
algo/scryptjane/scrypt-jane-mix_salsa-sse2.h
Normal file
@@ -0,0 +1,443 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_SSE2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a1(push ebx)
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
a2(mov edi,[ebp+20])
|
||||
a2(mov esi,[ebp+24])
|
||||
a2(mov eax,[ebp+28])
|
||||
a2(mov ebx,[ebp+32])
|
||||
a2(sub esp,32)
|
||||
a2(and esp,~63)
|
||||
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6)
|
||||
a2(lea ecx,[edx-64])
|
||||
a2(and eax, eax)
|
||||
a2(movdqa xmm0,[ecx+esi+0])
|
||||
a2(movdqa xmm1,[ecx+esi+16])
|
||||
a2(movdqa xmm2,[ecx+esi+32])
|
||||
a2(movdqa xmm3,[ecx+esi+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[ecx+eax+0])
|
||||
a2(pxor xmm1,[ecx+eax+16])
|
||||
a2(pxor xmm2,[ecx+eax+32])
|
||||
a2(pxor xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor ecx,ecx)
|
||||
a2(xor ebx,ebx)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and eax, eax)
|
||||
a2(pxor xmm0,[esi+ecx+0])
|
||||
a2(pxor xmm1,[esi+ecx+16])
|
||||
a2(pxor xmm2,[esi+ecx+32])
|
||||
a2(pxor xmm3,[esi+ecx+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[eax+ecx+0])
|
||||
a2(pxor xmm1,[eax+ecx+16])
|
||||
a2(pxor xmm2,[eax+ecx+32])
|
||||
a2(pxor xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa [esp+0],xmm0)
|
||||
a2(movdqa [esp+16],xmm1)
|
||||
a2(movdqa xmm6,xmm2)
|
||||
a2(movdqa xmm7,xmm3)
|
||||
a2(mov eax,8)
|
||||
a1(scrypt_salsa_sse2_loop: )
|
||||
a2(movdqa xmm4, xmm1)
|
||||
a2(paddd xmm4, xmm0)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 7)
|
||||
a2(psrld xmm5, 25)
|
||||
a2(pxor xmm3, xmm4)
|
||||
a2(movdqa xmm4, xmm0)
|
||||
a2(pxor xmm3, xmm5)
|
||||
a2(paddd xmm4, xmm3)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 9)
|
||||
a2(psrld xmm5, 23)
|
||||
a2(pxor xmm2, xmm4)
|
||||
a2(movdqa xmm4, xmm3)
|
||||
a2(pxor xmm2, xmm5)
|
||||
a3(pshufd xmm3, xmm3, 0x93)
|
||||
a2(paddd xmm4, xmm2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 13)
|
||||
a2(psrld xmm5, 19)
|
||||
a2(pxor xmm1, xmm4)
|
||||
a2(movdqa xmm4, xmm2)
|
||||
a2(pxor xmm1, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a2(paddd xmm4, xmm1)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 18)
|
||||
a2(psrld xmm5, 14)
|
||||
a2(pxor xmm0, xmm4)
|
||||
a2(movdqa xmm4, xmm3)
|
||||
a2(pxor xmm0, xmm5)
|
||||
a3(pshufd xmm1, xmm1, 0x39)
|
||||
a2(paddd xmm4, xmm0)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 7)
|
||||
a2(psrld xmm5, 25)
|
||||
a2(pxor xmm1, xmm4)
|
||||
a2(movdqa xmm4, xmm0)
|
||||
a2(pxor xmm1, xmm5)
|
||||
a2(paddd xmm4, xmm1)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 9)
|
||||
a2(psrld xmm5, 23)
|
||||
a2(pxor xmm2, xmm4)
|
||||
a2(movdqa xmm4, xmm1)
|
||||
a2(pxor xmm2, xmm5)
|
||||
a3(pshufd xmm1, xmm1, 0x93)
|
||||
a2(paddd xmm4, xmm2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 13)
|
||||
a2(psrld xmm5, 19)
|
||||
a2(pxor xmm3, xmm4)
|
||||
a2(movdqa xmm4, xmm2)
|
||||
a2(pxor xmm3, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a2(paddd xmm4, xmm3)
|
||||
a2(sub eax, 2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 18)
|
||||
a2(psrld xmm5, 14)
|
||||
a2(pxor xmm0, xmm4)
|
||||
a3(pshufd xmm3, xmm3, 0x39)
|
||||
a2(pxor xmm0, xmm5)
|
||||
a1(ja scrypt_salsa_sse2_loop)
|
||||
a2(paddd xmm0,[esp+0])
|
||||
a2(paddd xmm1,[esp+16])
|
||||
a2(paddd xmm2,xmm6)
|
||||
a2(paddd xmm3,xmm7)
|
||||
a2(lea eax,[ebx+ecx])
|
||||
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
a2(cmp ecx,edx)
|
||||
a2(movdqa [eax+0],xmm0)
|
||||
a2(movdqa [eax+16],xmm1)
|
||||
a2(movdqa [eax+32],xmm2)
|
||||
a2(movdqa [eax+48],xmm3)
|
||||
a2(mov eax,[ebp+28])
|
||||
a1(jne scrypt_ChunkMix_sse2_loop)
|
||||
a2(mov esp,ebp)
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
aret(16)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_SSE2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,6)
|
||||
a2(lea r9,[rcx-64])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa xmm8,xmm0)
|
||||
a2(movdqa xmm9,xmm1)
|
||||
a2(movdqa xmm10,xmm2)
|
||||
a2(movdqa xmm11,xmm3)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa_sse2_loop: )
|
||||
a2(movdqa xmm4, xmm1)
|
||||
a2(paddd xmm4, xmm0)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 7)
|
||||
a2(psrld xmm5, 25)
|
||||
a2(pxor xmm3, xmm4)
|
||||
a2(movdqa xmm4, xmm0)
|
||||
a2(pxor xmm3, xmm5)
|
||||
a2(paddd xmm4, xmm3)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 9)
|
||||
a2(psrld xmm5, 23)
|
||||
a2(pxor xmm2, xmm4)
|
||||
a2(movdqa xmm4, xmm3)
|
||||
a2(pxor xmm2, xmm5)
|
||||
a3(pshufd xmm3, xmm3, 0x93)
|
||||
a2(paddd xmm4, xmm2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 13)
|
||||
a2(psrld xmm5, 19)
|
||||
a2(pxor xmm1, xmm4)
|
||||
a2(movdqa xmm4, xmm2)
|
||||
a2(pxor xmm1, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a2(paddd xmm4, xmm1)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 18)
|
||||
a2(psrld xmm5, 14)
|
||||
a2(pxor xmm0, xmm4)
|
||||
a2(movdqa xmm4, xmm3)
|
||||
a2(pxor xmm0, xmm5)
|
||||
a3(pshufd xmm1, xmm1, 0x39)
|
||||
a2(paddd xmm4, xmm0)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 7)
|
||||
a2(psrld xmm5, 25)
|
||||
a2(pxor xmm1, xmm4)
|
||||
a2(movdqa xmm4, xmm0)
|
||||
a2(pxor xmm1, xmm5)
|
||||
a2(paddd xmm4, xmm1)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 9)
|
||||
a2(psrld xmm5, 23)
|
||||
a2(pxor xmm2, xmm4)
|
||||
a2(movdqa xmm4, xmm1)
|
||||
a2(pxor xmm2, xmm5)
|
||||
a3(pshufd xmm1, xmm1, 0x93)
|
||||
a2(paddd xmm4, xmm2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 13)
|
||||
a2(psrld xmm5, 19)
|
||||
a2(pxor xmm3, xmm4)
|
||||
a2(movdqa xmm4, xmm2)
|
||||
a2(pxor xmm3, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a2(paddd xmm4, xmm3)
|
||||
a2(sub rax, 2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 18)
|
||||
a2(psrld xmm5, 14)
|
||||
a2(pxor xmm0, xmm4)
|
||||
a3(pshufd xmm3, xmm3, 0x39)
|
||||
a2(pxor xmm0, xmm5)
|
||||
a1(ja scrypt_salsa_sse2_loop)
|
||||
a2(paddd xmm0,xmm8)
|
||||
a2(paddd xmm1,xmm9)
|
||||
a2(paddd xmm2,xmm10)
|
||||
a2(paddd xmm3,xmm11)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0x7f)
|
||||
a2(add r9,64)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a1(jne scrypt_ChunkMix_sse2_loop)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_SSE2
|
||||
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x4 = x1;
|
||||
x4 = _mm_add_epi32(x4, x0);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 7);
|
||||
x5 = _mm_srli_epi32(x5, 25);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = x0;
|
||||
x3 = _mm_xor_si128(x3, x5);
|
||||
x4 = _mm_add_epi32(x4, x3);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 9);
|
||||
x5 = _mm_srli_epi32(x5, 23);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = x3;
|
||||
x2 = _mm_xor_si128(x2, x5);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x93);
|
||||
x4 = _mm_add_epi32(x4, x2);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 13);
|
||||
x5 = _mm_srli_epi32(x5, 19);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = x2;
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x4 = _mm_add_epi32(x4, x1);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 18);
|
||||
x5 = _mm_srli_epi32(x5, 14);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x4 = x3;
|
||||
x0 = _mm_xor_si128(x0, x5);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x39);
|
||||
x4 = _mm_add_epi32(x4, x0);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 7);
|
||||
x5 = _mm_srli_epi32(x5, 25);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x4 = x0;
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x4 = _mm_add_epi32(x4, x1);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 9);
|
||||
x5 = _mm_srli_epi32(x5, 23);
|
||||
x2 = _mm_xor_si128(x2, x4);
|
||||
x4 = x1;
|
||||
x2 = _mm_xor_si128(x2, x5);
|
||||
x1 = _mm_shuffle_epi32(x1, 0x93);
|
||||
x4 = _mm_add_epi32(x4, x2);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 13);
|
||||
x5 = _mm_srli_epi32(x5, 19);
|
||||
x3 = _mm_xor_si128(x3, x4);
|
||||
x4 = x2;
|
||||
x3 = _mm_xor_si128(x3, x5);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x4e);
|
||||
x4 = _mm_add_epi32(x4, x3);
|
||||
x5 = x4;
|
||||
x4 = _mm_slli_epi32(x4, 18);
|
||||
x5 = _mm_srli_epi32(x5, 14);
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x39);
|
||||
x0 = _mm_xor_si128(x0, x5);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa/8-SSE2"
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#endif
|
||||
|
||||
/* used by avx,etc as well */
|
||||
#if defined(SCRYPT_SALSA_INCLUDED)
|
||||
/*
|
||||
Default layout:
|
||||
0 1 2 3
|
||||
4 5 6 7
|
||||
8 9 10 11
|
||||
12 13 14 15
|
||||
|
||||
SSE2 layout:
|
||||
0 5 10 15
|
||||
12 1 6 11
|
||||
8 13 2 7
|
||||
4 9 14 3
|
||||
*/
|
||||
|
||||
static void asm_calling_convention
|
||||
salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
|
||||
uint32_t t;
|
||||
while (count--) {
|
||||
t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
|
||||
t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
|
||||
t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
|
||||
t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
|
||||
t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
|
||||
t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
|
||||
blocks += 16;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
70
algo/scryptjane/scrypt-jane-mix_salsa.h
Normal file
70
algo/scryptjane/scrypt-jane-mix_salsa.h
Normal file
@@ -0,0 +1,70 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa20/8 Ref"
|
||||
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_BASIC
|
||||
|
||||
static void
|
||||
salsa_core_basic(uint32_t state[16]) {
|
||||
size_t rounds = 8;
|
||||
uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t;
|
||||
|
||||
x0 = state[0];
|
||||
x1 = state[1];
|
||||
x2 = state[2];
|
||||
x3 = state[3];
|
||||
x4 = state[4];
|
||||
x5 = state[5];
|
||||
x6 = state[6];
|
||||
x7 = state[7];
|
||||
x8 = state[8];
|
||||
x9 = state[9];
|
||||
x10 = state[10];
|
||||
x11 = state[11];
|
||||
x12 = state[12];
|
||||
x13 = state[13];
|
||||
x14 = state[14];
|
||||
x15 = state[15];
|
||||
|
||||
#define quarter(a,b,c,d) \
|
||||
t = a+d; t = ROTL32(t, 7); b ^= t; \
|
||||
t = b+a; t = ROTL32(t, 9); c ^= t; \
|
||||
t = c+b; t = ROTL32(t, 13); d ^= t; \
|
||||
t = d+c; t = ROTL32(t, 18); a ^= t; \
|
||||
|
||||
for (; rounds; rounds -= 2) {
|
||||
quarter( x0, x4, x8,x12)
|
||||
quarter( x5, x9,x13, x1)
|
||||
quarter(x10,x14, x2, x6)
|
||||
quarter(x15, x3, x7,x11)
|
||||
quarter( x0, x1, x2, x3)
|
||||
quarter( x5, x6, x7, x4)
|
||||
quarter(x10,x11, x8, x9)
|
||||
quarter(x15,x12,x13,x14)
|
||||
}
|
||||
|
||||
state[0] += x0;
|
||||
state[1] += x1;
|
||||
state[2] += x2;
|
||||
state[3] += x3;
|
||||
state[4] += x4;
|
||||
state[5] += x5;
|
||||
state[6] += x6;
|
||||
state[7] += x7;
|
||||
state[8] += x8;
|
||||
state[9] += x9;
|
||||
state[10] += x10;
|
||||
state[11] += x11;
|
||||
state[12] += x12;
|
||||
state[13] += x13;
|
||||
state[14] += x14;
|
||||
state[15] += x15;
|
||||
|
||||
#undef quarter
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
367
algo/scryptjane/scrypt-jane-mix_salsa64-avx.h
Normal file
367
algo/scryptjane/scrypt-jane-mix_salsa64-avx.h
Normal file
@@ -0,0 +1,367 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
a2(vmovdqa xmm4,[rax+64])
|
||||
a2(vmovdqa xmm5,[rax+80])
|
||||
a2(vmovdqa xmm6,[rax+96])
|
||||
a2(vmovdqa xmm7,[rax+112])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a3(vpxor xmm4,xmm4,[r9+64])
|
||||
a3(vpxor xmm5,xmm5,[r9+80])
|
||||
a3(vpxor xmm6,xmm6,[r9+96])
|
||||
a3(vpxor xmm7,xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rsi+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rsi+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rsi+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rsi+r9+112])
|
||||
a1(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rdx+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rdx+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rdx+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa [rsp+0],xmm0)
|
||||
a2(vmovdqa [rsp+16],xmm1)
|
||||
a2(vmovdqa [rsp+32],xmm2)
|
||||
a2(vmovdqa [rsp+48],xmm3)
|
||||
a2(vmovdqa [rsp+64],xmm4)
|
||||
a2(vmovdqa [rsp+80],xmm5)
|
||||
a2(vmovdqa [rsp+96],xmm6)
|
||||
a2(vmovdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_avx_loop: )
|
||||
a3(vpaddq xmm8, xmm0, xmm2)
|
||||
a3(vpaddq xmm9, xmm1, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm8)
|
||||
a3(vpxor xmm7, xmm7, xmm9)
|
||||
a3(vpaddq xmm10, xmm0, xmm6)
|
||||
a3(vpaddq xmm11, xmm1, xmm7)
|
||||
a3(vpsrlq xmm8, xmm10, 51)
|
||||
a3(vpsrlq xmm9, xmm11, 51)
|
||||
a3(vpsllq xmm10, xmm10, 13)
|
||||
a3(vpsllq xmm11, xmm11, 13)
|
||||
a3(vpxor xmm4, xmm4, xmm8)
|
||||
a3(vpxor xmm5, xmm5, xmm9)
|
||||
a3(vpxor xmm4, xmm4, xmm10)
|
||||
a3(vpxor xmm5, xmm5, xmm11)
|
||||
a3(vpaddq xmm8, xmm6, xmm4)
|
||||
a3(vpaddq xmm9, xmm7, xmm5)
|
||||
a3(vpsrlq xmm10, xmm8, 25)
|
||||
a3(vpsrlq xmm11, xmm9, 25)
|
||||
a3(vpsllq xmm8, xmm8, 39)
|
||||
a3(vpsllq xmm9, xmm9, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpaddq xmm10, xmm4, xmm2)
|
||||
a3(vpaddq xmm11, xmm5, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm10)
|
||||
a3(vpxor xmm1, xmm1, xmm11)
|
||||
a2(vmovdqa xmm8, xmm2)
|
||||
a2(vmovdqa xmm9, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm9, xmm8, 8)
|
||||
a4(vpalignr xmm7, xmm8, xmm9, 8)
|
||||
a2(sub rax, 2)
|
||||
a3(vpaddq xmm10, xmm0, xmm2)
|
||||
a3(vpaddq xmm11, xmm1, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm10)
|
||||
a3(vpxor xmm7, xmm7, xmm11)
|
||||
a3(vpaddq xmm8, xmm0, xmm6)
|
||||
a3(vpaddq xmm9, xmm1, xmm7)
|
||||
a3(vpsrlq xmm10, xmm8, 51)
|
||||
a3(vpsrlq xmm11, xmm9, 51)
|
||||
a3(vpsllq xmm8, xmm8, 13)
|
||||
a3(vpsllq xmm9, xmm9, 13)
|
||||
a3(vpxor xmm5, xmm5, xmm10)
|
||||
a3(vpxor xmm4, xmm4, xmm11)
|
||||
a3(vpxor xmm5, xmm5, xmm8)
|
||||
a3(vpxor xmm4, xmm4, xmm9)
|
||||
a3(vpaddq xmm10, xmm6, xmm5)
|
||||
a3(vpaddq xmm11, xmm7, xmm4)
|
||||
a3(vpsrlq xmm8, xmm10, 25)
|
||||
a3(vpsrlq xmm9, xmm11, 25)
|
||||
a3(vpsllq xmm10, xmm10, 39)
|
||||
a3(vpsllq xmm11, xmm11, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpaddq xmm8, xmm5, xmm2)
|
||||
a3(vpaddq xmm9, xmm4, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm8)
|
||||
a3(vpxor xmm1, xmm1, xmm9)
|
||||
a2(vmovdqa xmm10, xmm2)
|
||||
a2(vmovdqa xmm11, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm11, xmm10, 8)
|
||||
a4(vpalignr xmm7, xmm10, xmm11, 8)
|
||||
a1(ja scrypt_salsa64_avx_loop)
|
||||
a3(vpaddq xmm0,xmm0,[rsp+0])
|
||||
a3(vpaddq xmm1,xmm1,[rsp+16])
|
||||
a3(vpaddq xmm2,xmm2,[rsp+32])
|
||||
a3(vpaddq xmm3,xmm3,[rsp+48])
|
||||
a3(vpaddq xmm4,xmm4,[rsp+64])
|
||||
a3(vpaddq xmm5,xmm5,[rsp+80])
|
||||
a3(vpaddq xmm6,xmm6,[rsp+96])
|
||||
a3(vpaddq xmm7,xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
a2(vmovdqa [rax+64],xmm4)
|
||||
a2(vmovdqa [rax+80],xmm5)
|
||||
a2(vmovdqa [rax+96],xmm6)
|
||||
a2(vmovdqa [rax+112],xmm7)
|
||||
a1(jne scrypt_ChunkMix_avx_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX)
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-AVX"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
449
algo/scryptjane/scrypt-jane-mix_salsa64-sse2.h
Normal file
449
algo/scryptjane/scrypt-jane-mix_salsa64-sse2.h
Normal file
@@ -0,0 +1,449 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a2(movdqa xmm4,[rax+64])
|
||||
a2(movdqa xmm5,[rax+80])
|
||||
a2(movdqa xmm6,[rax+96])
|
||||
a2(movdqa xmm7,[rax+112])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a2(pxor xmm4,[r9+64])
|
||||
a2(pxor xmm5,[r9+80])
|
||||
a2(pxor xmm6,[r9+96])
|
||||
a2(pxor xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a2(pxor xmm4,[rsi+r9+64])
|
||||
a2(pxor xmm5,[rsi+r9+80])
|
||||
a2(pxor xmm6,[rsi+r9+96])
|
||||
a2(pxor xmm7,[rsi+r9+112])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a2(pxor xmm4,[rdx+r9+64])
|
||||
a2(pxor xmm5,[rdx+r9+80])
|
||||
a2(pxor xmm6,[rdx+r9+96])
|
||||
a2(pxor xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa [rsp+0],xmm0)
|
||||
a2(movdqa [rsp+16],xmm1)
|
||||
a2(movdqa [rsp+32],xmm2)
|
||||
a2(movdqa [rsp+48],xmm3)
|
||||
a2(movdqa [rsp+64],xmm4)
|
||||
a2(movdqa [rsp+80],xmm5)
|
||||
a2(movdqa [rsp+96],xmm6)
|
||||
a2(movdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_sse2_loop: )
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm4, xmm10)
|
||||
a2(pxor xmm5, xmm11)
|
||||
a2(pxor xmm4, xmm8)
|
||||
a2(pxor xmm5, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm4)
|
||||
a2(paddq xmm11, xmm5)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm4)
|
||||
a2(movdqa xmm9, xmm5)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm8, xmm2)
|
||||
a2(movdqa xmm9, xmm3)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(movdqa xmm2, xmm7)
|
||||
a2(movdqa xmm3, xmm6)
|
||||
a2(punpcklqdq xmm10, xmm6)
|
||||
a2(punpcklqdq xmm11, xmm7)
|
||||
a2(movdqa xmm6, xmm8)
|
||||
a2(movdqa xmm7, xmm9)
|
||||
a2(punpcklqdq xmm9, xmm9)
|
||||
a2(punpcklqdq xmm8, xmm8)
|
||||
a2(punpckhqdq xmm2, xmm10)
|
||||
a2(punpckhqdq xmm3, xmm11)
|
||||
a2(punpckhqdq xmm6, xmm9)
|
||||
a2(punpckhqdq xmm7, xmm8)
|
||||
a2(sub rax, 2)
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm5, xmm10)
|
||||
a2(pxor xmm4, xmm11)
|
||||
a2(pxor xmm5, xmm8)
|
||||
a2(pxor xmm4, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm5)
|
||||
a2(paddq xmm11, xmm4)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm5)
|
||||
a2(movdqa xmm9, xmm4)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm8, xmm2)
|
||||
a2(movdqa xmm9, xmm3)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(movdqa xmm2, xmm7)
|
||||
a2(movdqa xmm3, xmm6)
|
||||
a2(punpcklqdq xmm10, xmm6)
|
||||
a2(punpcklqdq xmm11, xmm7)
|
||||
a2(movdqa xmm6, xmm8)
|
||||
a2(movdqa xmm7, xmm9)
|
||||
a2(punpcklqdq xmm9, xmm9)
|
||||
a2(punpcklqdq xmm8, xmm8)
|
||||
a2(punpckhqdq xmm2, xmm10)
|
||||
a2(punpckhqdq xmm3, xmm11)
|
||||
a2(punpckhqdq xmm6, xmm9)
|
||||
a2(punpckhqdq xmm7, xmm8)
|
||||
a1(ja scrypt_salsa64_sse2_loop)
|
||||
a2(paddq xmm0,[rsp+0])
|
||||
a2(paddq xmm1,[rsp+16])
|
||||
a2(paddq xmm2,[rsp+32])
|
||||
a2(paddq xmm3,[rsp+48])
|
||||
a2(paddq xmm4,[rsp+64])
|
||||
a2(paddq xmm5,[rsp+80])
|
||||
a2(paddq xmm6,[rsp+96])
|
||||
a2(paddq xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a2(movdqa [rax+64],xmm4)
|
||||
a2(movdqa [rax+80],xmm5)
|
||||
a2(movdqa [rax+96],xmm6)
|
||||
a2(movdqa [rax+112],xmm7)
|
||||
a1(jne scrypt_ChunkMix_sse2_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2)
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-SSE2"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
|
||||
/* sse3/avx use this as well */
|
||||
#if defined(SCRYPT_SALSA64_INCLUDED)
|
||||
/*
|
||||
Default layout:
|
||||
0 1 2 3
|
||||
4 5 6 7
|
||||
8 9 10 11
|
||||
12 13 14 15
|
||||
|
||||
SSE2 layout:
|
||||
0 5 10 15
|
||||
12 1 6 11
|
||||
8 13 2 7
|
||||
4 9 14 3
|
||||
*/
|
||||
|
||||
|
||||
static void asm_calling_convention
|
||||
salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
|
||||
uint64_t t;
|
||||
while (count--) {
|
||||
t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
|
||||
t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
|
||||
t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
|
||||
t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
|
||||
t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
|
||||
t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
|
||||
blocks += 16;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
399
algo/scryptjane/scrypt-jane-mix_salsa64-ssse3.h
Normal file
399
algo/scryptjane/scrypt-jane-mix_salsa64-ssse3.h
Normal file
@@ -0,0 +1,399 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_ssse3)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a2(movdqa xmm4,[rax+64])
|
||||
a2(movdqa xmm5,[rax+80])
|
||||
a2(movdqa xmm6,[rax+96])
|
||||
a2(movdqa xmm7,[rax+112])
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a2(pxor xmm4,[r9+64])
|
||||
a2(pxor xmm5,[r9+80])
|
||||
a2(pxor xmm6,[r9+96])
|
||||
a2(pxor xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_ssse3_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a2(pxor xmm4,[rsi+r9+64])
|
||||
a2(pxor xmm5,[rsi+r9+80])
|
||||
a2(pxor xmm6,[rsi+r9+96])
|
||||
a2(pxor xmm7,[rsi+r9+112])
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a2(pxor xmm4,[rdx+r9+64])
|
||||
a2(pxor xmm5,[rdx+r9+80])
|
||||
a2(pxor xmm6,[rdx+r9+96])
|
||||
a2(pxor xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor2:)
|
||||
a2(movdqa [rsp+0],xmm0)
|
||||
a2(movdqa [rsp+16],xmm1)
|
||||
a2(movdqa [rsp+32],xmm2)
|
||||
a2(movdqa [rsp+48],xmm3)
|
||||
a2(movdqa [rsp+64],xmm4)
|
||||
a2(movdqa [rsp+80],xmm5)
|
||||
a2(movdqa [rsp+96],xmm6)
|
||||
a2(movdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_ssse3_loop: )
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm4, xmm10)
|
||||
a2(pxor xmm5, xmm11)
|
||||
a2(pxor xmm4, xmm8)
|
||||
a2(pxor xmm5, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm4)
|
||||
a2(paddq xmm11, xmm5)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm4)
|
||||
a2(movdqa xmm9, xmm5)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm10, xmm2)
|
||||
a2(movdqa xmm11, xmm3)
|
||||
a2(movdqa xmm2, xmm6)
|
||||
a2(movdqa xmm3, xmm7)
|
||||
a3(palignr xmm2, xmm7, 8)
|
||||
a3(palignr xmm3, xmm6, 8)
|
||||
a2(movdqa xmm6, xmm11)
|
||||
a2(movdqa xmm7, xmm10)
|
||||
a3(palignr xmm6, xmm10, 8)
|
||||
a3(palignr xmm7, xmm11, 8)
|
||||
a2(sub rax, 2)
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm5, xmm10)
|
||||
a2(pxor xmm4, xmm11)
|
||||
a2(pxor xmm5, xmm8)
|
||||
a2(pxor xmm4, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm5)
|
||||
a2(paddq xmm11, xmm4)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm5)
|
||||
a2(movdqa xmm9, xmm4)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm10, xmm2)
|
||||
a2(movdqa xmm11, xmm3)
|
||||
a2(movdqa xmm2, xmm6)
|
||||
a2(movdqa xmm3, xmm7)
|
||||
a3(palignr xmm2, xmm7, 8)
|
||||
a3(palignr xmm3, xmm6, 8)
|
||||
a2(movdqa xmm6, xmm11)
|
||||
a2(movdqa xmm7, xmm10)
|
||||
a3(palignr xmm6, xmm10, 8)
|
||||
a3(palignr xmm7, xmm11, 8)
|
||||
a1(ja scrypt_salsa64_ssse3_loop)
|
||||
a2(paddq xmm0,[rsp+0])
|
||||
a2(paddq xmm1,[rsp+16])
|
||||
a2(paddq xmm2,[rsp+32])
|
||||
a2(paddq xmm3,[rsp+48])
|
||||
a2(paddq xmm4,[rsp+64])
|
||||
a2(paddq xmm5,[rsp+80])
|
||||
a2(paddq xmm6,[rsp+96])
|
||||
a2(paddq xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a2(movdqa [rax+64],xmm4)
|
||||
a2(movdqa [rax+80],xmm5)
|
||||
a2(movdqa [rax+96],xmm6)
|
||||
a2(movdqa [rax+112],xmm7)
|
||||
a1(jne scrypt_ChunkMix_ssse3_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3)
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-SSSE3"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
41
algo/scryptjane/scrypt-jane-mix_salsa64.h
Normal file
41
algo/scryptjane/scrypt-jane-mix_salsa64.h
Normal file
@@ -0,0 +1,41 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8 Ref"
|
||||
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_BASIC
|
||||
|
||||
static void
|
||||
salsa64_core_basic(uint64_t state[16]) {
|
||||
const size_t rounds = 8;
|
||||
uint64_t v[16], t;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < 16; i++) v[i] = state[i];
|
||||
|
||||
#define G(a,b,c,d) \
|
||||
t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \
|
||||
t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \
|
||||
t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \
|
||||
t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \
|
||||
|
||||
for (i = 0; i < rounds; i += 2) {
|
||||
G( 0, 4, 8,12);
|
||||
G( 5, 9,13, 1);
|
||||
G(10,14, 2, 6);
|
||||
G(15, 3, 7,11);
|
||||
G( 0, 1, 2, 3);
|
||||
G( 5, 6, 7, 4);
|
||||
G(10,11, 8, 9);
|
||||
G(15,12,13,14);
|
||||
}
|
||||
|
||||
for (i = 0; i < 16; i++) state[i] += v[i];
|
||||
|
||||
#undef G
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
161
algo/scryptjane/scrypt-jane-pbkdf2.h
Normal file
161
algo/scryptjane/scrypt-jane-pbkdf2.h
Normal file
@@ -0,0 +1,161 @@
|
||||
typedef struct scrypt_hmac_state_t {
|
||||
scrypt_hash_state inner, outer;
|
||||
} scrypt_hmac_state;
|
||||
|
||||
|
||||
static void
|
||||
scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
|
||||
scrypt_hash_state st;
|
||||
scrypt_hash_init(&st);
|
||||
scrypt_hash_update(&st, m, mlen);
|
||||
scrypt_hash_finish(&st, hash);
|
||||
}
|
||||
|
||||
/* hmac */
|
||||
static void
|
||||
scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
|
||||
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
|
||||
size_t i;
|
||||
|
||||
scrypt_hash_init(&st->inner);
|
||||
scrypt_hash_init(&st->outer);
|
||||
|
||||
if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
|
||||
/* use the key directly if it's <= blocksize bytes */
|
||||
memcpy(pad, key, keylen);
|
||||
} else {
|
||||
/* if it's > blocksize bytes, hash it */
|
||||
scrypt_hash(pad, key, keylen);
|
||||
}
|
||||
|
||||
/* inner = (key ^ 0x36) */
|
||||
/* h(inner || ...) */
|
||||
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
|
||||
pad[i] ^= 0x36;
|
||||
scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);
|
||||
|
||||
/* outer = (key ^ 0x5c) */
|
||||
/* h(outer || ...) */
|
||||
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
|
||||
pad[i] ^= (0x5c ^ 0x36);
|
||||
scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
scrypt_ensure_zero(pad, sizeof(pad));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
|
||||
/* h(inner || m...) */
|
||||
scrypt_hash_update(&st->inner, m, mlen);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
|
||||
/* h(inner || m) */
|
||||
scrypt_hash_digest innerhash;
|
||||
scrypt_hash_finish(&st->inner, innerhash);
|
||||
|
||||
/* h(outer || h(inner || m)) */
|
||||
scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
|
||||
scrypt_hash_finish(&st->outer, mac);
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
scrypt_ensure_zero(st, sizeof(*st));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
|
||||
scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
|
||||
scrypt_hash_digest ti, u;
|
||||
uint8_t be[4];
|
||||
uint32_t i, j, blocks;
|
||||
uint64_t c;
|
||||
|
||||
/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
|
||||
|
||||
/* hmac(password, ...) */
|
||||
scrypt_hmac_init(&hmac_pw, password, password_len);
|
||||
|
||||
/* hmac(password, salt...) */
|
||||
hmac_pw_salt = hmac_pw;
|
||||
scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
|
||||
|
||||
blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
|
||||
for (i = 1; i <= blocks; i++) {
|
||||
/* U1 = hmac(password, salt || be(i)) */
|
||||
U32TO8_BE(be, i);
|
||||
work = hmac_pw_salt;
|
||||
scrypt_hmac_update(&work, be, 4);
|
||||
scrypt_hmac_finish(&work, ti);
|
||||
memcpy(u, ti, sizeof(u));
|
||||
|
||||
/* T[i] = U1 ^ U2 ^ U3... */
|
||||
for (c = 0; c < N - 1; c++) {
|
||||
/* UX = hmac(password, U{X-1}) */
|
||||
work = hmac_pw;
|
||||
scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
|
||||
scrypt_hmac_finish(&work, u);
|
||||
|
||||
/* T[i] ^= UX */
|
||||
for (j = 0; j < sizeof(u); j++)
|
||||
ti[j] ^= u[j];
|
||||
}
|
||||
|
||||
memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
|
||||
out += SCRYPT_HASH_DIGEST_SIZE;
|
||||
bytes -= SCRYPT_HASH_DIGEST_SIZE;
|
||||
}
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
scrypt_ensure_zero(ti, sizeof(ti));
|
||||
scrypt_ensure_zero(u, sizeof(u));
|
||||
scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
|
||||
scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version where N = 1
|
||||
* - mikaelh
|
||||
*/
|
||||
static void
|
||||
scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) {
|
||||
scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
|
||||
scrypt_hash_digest ti, u;
|
||||
uint8_t be[4];
|
||||
uint32_t i, /*j,*/ blocks;
|
||||
//uint64_t c;
|
||||
|
||||
/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
|
||||
|
||||
/* hmac(password, ...) */
|
||||
scrypt_hmac_init(&hmac_pw, password, password_len);
|
||||
|
||||
/* hmac(password, salt...) */
|
||||
hmac_pw_salt = hmac_pw;
|
||||
scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
|
||||
|
||||
blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
|
||||
for (i = 1; i <= blocks; i++) {
|
||||
/* U1 = hmac(password, salt || be(i)) */
|
||||
U32TO8_BE(be, i);
|
||||
work = hmac_pw_salt;
|
||||
scrypt_hmac_update(&work, be, 4);
|
||||
scrypt_hmac_finish(&work, ti);
|
||||
memcpy(u, ti, sizeof(u));
|
||||
|
||||
memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
|
||||
out += SCRYPT_HASH_DIGEST_SIZE;
|
||||
bytes -= SCRYPT_HASH_DIGEST_SIZE;
|
||||
}
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
scrypt_ensure_zero(ti, sizeof(ti));
|
||||
scrypt_ensure_zero(u, sizeof(u));
|
||||
scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
|
||||
scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
|
||||
#endif
|
||||
}
|
||||
391
algo/scryptjane/scrypt-jane-portable-x86.h
Normal file
391
algo/scryptjane/scrypt-jane-portable-x86.h
Normal file
@@ -0,0 +1,391 @@
|
||||
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
|
||||
#define X86ASM
|
||||
/* gcc 2.95 royally screws up stack alignments on variables */
|
||||
#if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
|
||||
#define X86ASM_SSE
|
||||
#define X86ASM_SSE2
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
|
||||
#define X86ASM_SSSE3
|
||||
#endif
|
||||
#if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
|
||||
#define X86ASM_AVX
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) && defined(COMPILER_GCC)
|
||||
#define X86_64ASM
|
||||
#define X86_64ASM_SSE2
|
||||
#if (COMPILER_GCC >= 40102)
|
||||
#define X86_64ASM_SSSE3
|
||||
#endif
|
||||
#if (COMPILER_GCC >= 40400)
|
||||
#define X86_64ASM_AVX
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC)
|
||||
#define X86_INTRINSIC
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= 1400)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC) && defined(CPU_X86_64)
|
||||
#define X86_64USE_INTRINSIC
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC) && defined(CPU_X86_64)
|
||||
#define X86_64USE_INTRINSIC
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
#define X86_INTRINSIC_AVX
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
|
||||
#define X86_INTRINSIC
|
||||
#if defined(__SSE__)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(__SSE2__)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if defined(__SSSE3__)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if defined(__AVX__)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#endif
|
||||
|
||||
/* HACK - I want to use CPU_X86_FORCE_INTRINSICS with mingw64 so these need to be undefined - mikaelh */
|
||||
#undef X86_64ASM_SSSE3
|
||||
#undef X86_64ASM_AVX
|
||||
#undef X86_64ASM_SSE2
|
||||
#undef X86ASM_AVX
|
||||
#undef X86ASM_SSSE3
|
||||
#undef X86ASM_SSE2
|
||||
#undef X86ASM_SSE
|
||||
#endif
|
||||
|
||||
/* only use simd on windows (or SSE2 on gcc)! */
|
||||
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
|
||||
#if defined(X86_INTRINSIC_SSE)
|
||||
#define X86_INTRINSIC
|
||||
#include <mmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
typedef __m64 qmm;
|
||||
typedef __m128 xmm;
|
||||
typedef __m128d xmmd;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#include <emmintrin.h>
|
||||
typedef __m128i xmmi;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#if defined (X86_INTRINSIC_AVX)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
xmmi v;
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
xmmi v;
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
xmmi v;
|
||||
} packedelem64;
|
||||
#else
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
uint32_t dw[4];
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
uint8_t b[16];
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
uint8_t b[16];
|
||||
} packedelem64;
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
|
||||
static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
|
||||
#endif
|
||||
|
||||
/*
|
||||
x86 inline asm for gcc/msvc. usage:
|
||||
|
||||
asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
|
||||
asm_naked_fn(name)
|
||||
a1(..)
|
||||
a2(.., ..)
|
||||
a3(.., .., ..)
|
||||
64bit OR 0 paramters: a1(ret)
|
||||
32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters
|
||||
asm_naked_fn_end(name)
|
||||
*/
|
||||
|
||||
#if defined(X86ASM) || defined(X86_64ASM)
|
||||
|
||||
#if defined(COMPILER_MSVC)
|
||||
#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
|
||||
#define a1(x) __asm {x}
|
||||
#define a2(x, y) __asm {x, y}
|
||||
#define a3(x, y, z) __asm {x, y, z}
|
||||
#define a4(x, y, z, w) __asm {x, y, z, w}
|
||||
#define al(x) __asm {label##x:}
|
||||
#define aj(x, y, z) __asm {x label##y}
|
||||
#define asm_align8 a1(ALIGN 8)
|
||||
#define asm_align16 a1(ALIGN 16)
|
||||
|
||||
#define asm_calling_convention STDCALL
|
||||
#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
|
||||
#define asm_naked_fn(fn) {
|
||||
#define asm_naked_fn_end(fn) }
|
||||
#elif defined(COMPILER_GCC)
|
||||
#define GNU_AS1(x) #x ";\n"
|
||||
#define GNU_AS2(x, y) #x ", " #y ";\n"
|
||||
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
|
||||
#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
|
||||
#define GNU_ASL(x) "\n" #x ":\n"
|
||||
#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
|
||||
#define GNU_ASJ(x, y, z) #x " " #y #z ";"
|
||||
|
||||
#define a1(x) GNU_AS1(x)
|
||||
#define a2(x, y) GNU_AS2(x, y)
|
||||
#define a3(x, y, z) GNU_AS3(x, y, z)
|
||||
#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
|
||||
#define al(x) GNU_ASL(x)
|
||||
#define aj(x, y, z) GNU_ASJ(x, y, z)
|
||||
#define asm_align8 a1(.align 8)
|
||||
#define asm_align16 a1(.align 16)
|
||||
|
||||
#if defined(OS_WINDOWS)
|
||||
#define asm_calling_convention CDECL
|
||||
#define aret(n) a1(ret)
|
||||
#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
|
||||
#else
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" );
|
||||
#endif
|
||||
#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
|
||||
#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
|
||||
#define asm_gcc_parms() ".att_syntax prefix;"
|
||||
#define asm_gcc_trashed() __asm__ __volatile__("" :::
|
||||
#define asm_gcc_end() );
|
||||
#else
|
||||
need x86 asm
|
||||
#endif
|
||||
|
||||
#endif /* X86ASM || X86_64ASM */
|
||||
|
||||
|
||||
#if defined(CPU_X86) || defined(CPU_X86_64)
|
||||
|
||||
typedef enum cpu_flags_x86_t {
|
||||
cpu_mmx = 1 << 0,
|
||||
cpu_sse = 1 << 1,
|
||||
cpu_sse2 = 1 << 2,
|
||||
cpu_sse3 = 1 << 3,
|
||||
cpu_ssse3 = 1 << 4,
|
||||
cpu_sse4_1 = 1 << 5,
|
||||
cpu_sse4_2 = 1 << 6,
|
||||
cpu_avx = 1 << 7
|
||||
} cpu_flags_x86;
|
||||
|
||||
typedef enum cpu_vendors_x86_t {
|
||||
cpu_nobody,
|
||||
cpu_intel,
|
||||
cpu_amd
|
||||
} cpu_vendors_x86;
|
||||
|
||||
typedef struct x86_regs_t {
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
} x86_regs;
|
||||
|
||||
#if defined(X86ASM)
|
||||
asm_naked_fn_proto(int, has_cpuid)(void)
|
||||
asm_naked_fn(has_cpuid)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(mov ecx, eax)
|
||||
a2(xor eax, 0x200000)
|
||||
a1(push eax)
|
||||
a1(popfd)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(xor eax, ecx)
|
||||
a2(shr eax, 21)
|
||||
a2(and eax, 1)
|
||||
a1(push ecx)
|
||||
a1(popfd)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(has_cpuid)
|
||||
#endif /* X86ASM */
|
||||
|
||||
|
||||
static void NOINLINE
|
||||
get_cpuid(x86_regs *regs, uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
__cpuid((int *)regs, (int)flags);
|
||||
#else
|
||||
#if defined(CPU_X86_64)
|
||||
#define cpuid_bx rbx
|
||||
#else
|
||||
#define cpuid_bx ebx
|
||||
#endif
|
||||
|
||||
asm_gcc()
|
||||
a1(push cpuid_bx)
|
||||
a1(cpuid)
|
||||
a2(mov [%1 + 0], eax)
|
||||
a2(mov [%1 + 4], ebx)
|
||||
a2(mov [%1 + 8], ecx)
|
||||
a2(mov [%1 + 12], edx)
|
||||
a1(pop cpuid_bx)
|
||||
asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
|
||||
asm_gcc_end()
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
static uint64_t NOINLINE
|
||||
get_xgetbv(uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
return _xgetbv(flags);
|
||||
#else
|
||||
uint32_t lo, hi;
|
||||
asm_gcc()
|
||||
a1(xgetbv)
|
||||
asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
|
||||
asm_gcc_end()
|
||||
return ((uint64_t)lo | ((uint64_t)hi << 32));
|
||||
#endif
|
||||
}
|
||||
#endif // AVX support
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
size_t cpu_detect_mask = (size_t)-1;
|
||||
#endif
|
||||
|
||||
static size_t
|
||||
detect_cpu(void) {
|
||||
union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
|
||||
cpu_vendors_x86 vendor = cpu_nobody;
|
||||
x86_regs regs;
|
||||
uint32_t max_level;
|
||||
size_t cpu_flags = 0;
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
uint64_t xgetbv_flags;
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86)
|
||||
if (!has_cpuid())
|
||||
return cpu_flags;
|
||||
#endif
|
||||
|
||||
get_cpuid(®s, 0);
|
||||
max_level = regs.eax;
|
||||
vendor_string.i[0] = regs.ebx;
|
||||
vendor_string.i[1] = regs.edx;
|
||||
vendor_string.i[2] = regs.ecx;
|
||||
|
||||
if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
|
||||
vendor = cpu_intel;
|
||||
else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
|
||||
vendor = cpu_amd;
|
||||
|
||||
if (max_level & 0x00000500) {
|
||||
/* "Intel P5 pre-B0" */
|
||||
cpu_flags |= cpu_mmx;
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
if (max_level < 1)
|
||||
return cpu_flags;
|
||||
|
||||
get_cpuid(®s, 1);
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
/* xsave/xrestore */
|
||||
if (regs.ecx & (1 << 27)) {
|
||||
xgetbv_flags = get_xgetbv(0);
|
||||
if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;
|
||||
}
|
||||
#endif
|
||||
if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
|
||||
if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2;
|
||||
if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3;
|
||||
if (regs.ecx & (1 )) cpu_flags |= cpu_sse3;
|
||||
if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
|
||||
if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
|
||||
if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
cpu_flags &= cpu_detect_mask;
|
||||
#endif
|
||||
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static const char *
|
||||
get_top_cpuflag_desc(size_t flag) {
|
||||
if (flag & cpu_avx) return "AVX";
|
||||
else if (flag & cpu_sse4_2) return "SSE4.2";
|
||||
else if (flag & cpu_sse4_1) return "SSE4.1";
|
||||
else if (flag & cpu_ssse3) return "SSSE3";
|
||||
else if (flag & cpu_sse2) return "SSE2";
|
||||
else if (flag & cpu_sse) return "SSE";
|
||||
else if (flag & cpu_mmx) return "MMX";
|
||||
else return "Basic";
|
||||
}
|
||||
#endif
|
||||
|
||||
/* enable the highest system-wide option */
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#if !defined(__AVX__)
|
||||
#undef X86_64ASM_AVX
|
||||
#undef X86ASM_AVX
|
||||
#undef X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if !defined(__SSSE3__)
|
||||
#undef X86_64ASM_SSSE3
|
||||
#undef X86ASM_SSSE3
|
||||
#undef X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if !defined(__SSE2__)
|
||||
#undef X86_64ASM_SSE2
|
||||
#undef X86ASM_SSE2
|
||||
#undef X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
|
||||
280
algo/scryptjane/scrypt-jane-portable.h
Normal file
280
algo/scryptjane/scrypt-jane-portable.h
Normal file
@@ -0,0 +1,280 @@
|
||||
/* determine os */
|
||||
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#include <wincrypt.h>
|
||||
#define OS_WINDOWS
|
||||
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_SOLARIS
|
||||
#else
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/param.h> /* need this to define BSD */
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_NIX
|
||||
#if defined(__linux__)
|
||||
#include <endian.h>
|
||||
#define OS_LINUX
|
||||
#elif defined(BSD)
|
||||
#define OS_BSD
|
||||
|
||||
#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
|
||||
#define OS_OSX
|
||||
#elif defined(macintosh) || defined(Macintosh)
|
||||
#define OS_MAC
|
||||
#elif defined(__OpenBSD__)
|
||||
#define OS_OPENBSD
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* determine compiler */
|
||||
#if defined(_MSC_VER)
|
||||
#define COMPILER_MSVC _MSC_VER
|
||||
#if ((COMPILER_MSVC > 1200) || defined(_mm_free))
|
||||
#define COMPILER_MSVC6PP_AND_LATER
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= 1500)
|
||||
#define COMPILER_HAS_TMMINTRIN
|
||||
#endif
|
||||
|
||||
#pragma warning(disable : 4127) /* conditional expression is constant */
|
||||
#pragma warning(disable : 4100) /* unreferenced formal parameter */
|
||||
|
||||
#include <float.h>
|
||||
#include <stdlib.h> /* _rotl */
|
||||
#include <intrin.h>
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef signed __int64 int64_t;
|
||||
|
||||
#define ROTL32(a,b) _rotl(a,b)
|
||||
#define ROTR32(a,b) _rotr(a,b)
|
||||
#define ROTL64(a,b) _rotl64(a,b)
|
||||
#define ROTR64(a,b) _rotr64(a,b)
|
||||
#undef NOINLINE
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#undef INLINE
|
||||
#define INLINE __forceinline
|
||||
#undef FASTCALL
|
||||
#define FASTCALL __fastcall
|
||||
#undef CDECL
|
||||
#define CDECL __cdecl
|
||||
#undef STDCALL
|
||||
#define STDCALL __stdcall
|
||||
#undef NAKED
|
||||
#define NAKED __declspec(naked)
|
||||
#define MM16 __declspec(align(16))
|
||||
#endif
|
||||
#if defined(__ICC)
|
||||
#define COMPILER_INTEL
|
||||
#endif
|
||||
#if defined(__GNUC__)
|
||||
#if (__GNUC__ >= 3)
|
||||
#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
|
||||
#else
|
||||
#define COMPILER_GCC_PATCHLEVEL 0
|
||||
#endif
|
||||
#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
|
||||
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
|
||||
#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b)))
|
||||
#undef NOINLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define NOINLINE
|
||||
#endif
|
||||
#undef INLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define INLINE __attribute__((always_inline))
|
||||
#else
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#undef FASTCALL
|
||||
#if (COMPILER_GCC >= 30400)
|
||||
#define FASTCALL __attribute__((fastcall))
|
||||
#else
|
||||
#define FASTCALL
|
||||
#endif
|
||||
#undef CDECL
|
||||
#define CDECL __attribute__((cdecl))
|
||||
#undef STDCALL
|
||||
#define STDCALL __attribute__((stdcall))
|
||||
#define MM16 __attribute__((aligned(16)))
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__)
|
||||
#define COMPILER_MINGW
|
||||
#endif
|
||||
#if defined(__PATHCC__)
|
||||
#define COMPILER_PATHCC
|
||||
#endif
|
||||
|
||||
#define OPTIONAL_INLINE
|
||||
#if defined(OPTIONAL_INLINE)
|
||||
#undef OPTIONAL_INLINE
|
||||
#define OPTIONAL_INLINE INLINE
|
||||
#else
|
||||
#define OPTIONAL_INLINE
|
||||
#endif
|
||||
|
||||
#define CRYPTO_FN NOINLINE STDCALL
|
||||
|
||||
/* determine cpu */
|
||||
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
|
||||
#define CPU_X86_64
|
||||
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
|
||||
#define CPU_X86 500
|
||||
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
|
||||
#define CPU_X86 400
|
||||
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
|
||||
#define CPU_X86 300
|
||||
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
|
||||
#define CPU_IA64
|
||||
#endif
|
||||
|
||||
#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
|
||||
#define CPU_SPARC
|
||||
#if defined(__sparcv9)
|
||||
#define CPU_SPARC64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
|
||||
#define CPU_64BITS
|
||||
#undef FASTCALL
|
||||
#define FASTCALL
|
||||
#undef CDECL
|
||||
#define CDECL
|
||||
#undef STDCALL
|
||||
#define STDCALL
|
||||
#endif
|
||||
|
||||
#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
|
||||
#define CPU_PPC
|
||||
#if defined(_ARCH_PWR7)
|
||||
#define CPU_POWER7
|
||||
#elif defined(__64BIT__)
|
||||
#define CPU_PPC64
|
||||
#else
|
||||
#define CPU_PPC32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__hppa__) || defined(__hppa)
|
||||
#define CPU_HPPA
|
||||
#endif
|
||||
|
||||
#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
|
||||
#define CPU_ALPHA
|
||||
#endif
|
||||
|
||||
/* endian */
|
||||
|
||||
#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
|
||||
(defined(CPU_X86) || defined(CPU_X86_64)) || \
|
||||
(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
|
||||
#define CPU_LE
|
||||
#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
|
||||
(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
|
||||
#define CPU_BE
|
||||
#else
|
||||
/* unknown endian! */
|
||||
#endif
|
||||
|
||||
|
||||
#define U8TO32_BE(p) \
|
||||
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
|
||||
((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) ))
|
||||
|
||||
#define U8TO32_LE(p) \
|
||||
(((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \
|
||||
((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
|
||||
|
||||
#define U32TO8_BE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
|
||||
(p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
|
||||
|
||||
#define U32TO8_LE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \
|
||||
(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
|
||||
|
||||
#define U8TO64_BE(p) \
|
||||
(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
|
||||
|
||||
#define U8TO64_LE(p) \
|
||||
(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
|
||||
|
||||
#define U64TO8_BE(p, v) \
|
||||
U32TO8_BE((p), (uint32_t)((v) >> 32)); \
|
||||
U32TO8_BE((p) + 4, (uint32_t)((v) ));
|
||||
|
||||
#define U64TO8_LE(p, v) \
|
||||
U32TO8_LE((p), (uint32_t)((v) )); \
|
||||
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
|
||||
|
||||
#define U32_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \
|
||||
(v) = ((v) << 16) | ((v) >> 16); \
|
||||
}
|
||||
|
||||
#define U64_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \
|
||||
(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \
|
||||
(v) = ((v) << 32) | ((v) >> 32); \
|
||||
}
|
||||
|
||||
static int
|
||||
scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
|
||||
uint32_t differentbits = 0;
|
||||
while (len--)
|
||||
differentbits |= (*x++ ^ *y++);
|
||||
return (1 & ((differentbits - 1) >> 8));
|
||||
}
|
||||
|
||||
void
|
||||
scrypt_ensure_zero(void *p, size_t len) {
|
||||
#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
|
||||
__stosb((unsigned char *)p, 0, len);
|
||||
#elif (defined(CPU_X86) && defined(COMPILER_GCC))
|
||||
__asm__ __volatile__(
|
||||
"pushl %%edi;\n"
|
||||
"pushl %%ecx;\n"
|
||||
"rep stosb;\n"
|
||||
"popl %%ecx;\n"
|
||||
"popl %%edi;\n"
|
||||
:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
|
||||
);
|
||||
#elif (defined(CPU_X86_64) && defined(COMPILER_GCC))
|
||||
__asm__ __volatile__(
|
||||
"pushq %%rdi;\n"
|
||||
"pushq %%rcx;\n"
|
||||
"rep stosb;\n"
|
||||
"popq %%rcx;\n"
|
||||
"popq %%rdi;\n"
|
||||
:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
|
||||
);
|
||||
#else
|
||||
volatile uint8_t *b = (volatile uint8_t *)p;
|
||||
size_t i;
|
||||
for (i = 0; i < len; i++)
|
||||
b[i] = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "scrypt-jane-portable-x86.h"
|
||||
|
||||
67
algo/scryptjane/scrypt-jane-romix-basic.h
Normal file
67
algo/scryptjane/scrypt-jane-romix-basic.h
Normal file
@@ -0,0 +1,67 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
/* function type returned by scrypt_getROMix, used with cpu detection */
|
||||
typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
|
||||
#endif
|
||||
|
||||
/* romix pre/post nop function */
|
||||
static void /* asm_calling_convention */
|
||||
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
}
|
||||
|
||||
/* romix pre/post endian conversion function */
|
||||
static void /* asm_calling_convention */
|
||||
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
#if !defined(CPU_LE)
|
||||
static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
|
||||
size_t i;
|
||||
if (endian_test.w == 0x100) {
|
||||
nblocks *= SCRYPT_BLOCK_WORDS;
|
||||
for (i = 0; i < nblocks; i++) {
|
||||
SCRYPT_WORD_ENDIAN_SWAP(blocks[i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* chunkmix test function */
|
||||
typedef void (*chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
|
||||
typedef void (*blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
|
||||
|
||||
static int
|
||||
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
|
||||
/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
|
||||
const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
|
||||
scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
|
||||
uint8_t final[16];
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < words; i++) {
|
||||
v = (scrypt_mix_word_t)i;
|
||||
v = (v << 8) | v;
|
||||
v = (v << 16) | v;
|
||||
chunk[0][i] = v;
|
||||
}
|
||||
|
||||
prefn(chunk[0], blocks);
|
||||
mixfn(chunk[1], chunk[0], NULL, r);
|
||||
postfn(chunk[1], blocks);
|
||||
|
||||
/* grab the last 16 bytes of the final block */
|
||||
for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) {
|
||||
SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
|
||||
}
|
||||
|
||||
return scrypt_verify(expected, final, 16);
|
||||
}
|
||||
|
||||
/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */
|
||||
static scrypt_mix_word_t *
|
||||
scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
|
||||
return base + (i * len);
|
||||
}
|
||||
|
||||
/* returns a pointer to block i */
|
||||
static scrypt_mix_word_t *
|
||||
scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
|
||||
return base + (i * SCRYPT_BLOCK_WORDS);
|
||||
}
|
||||
179
algo/scryptjane/scrypt-jane-romix-template.h
Normal file
179
algo/scryptjane/scrypt-jane-romix-template.h
Normal file
@@ -0,0 +1,179 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
|
||||
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix
|
||||
#endif
|
||||
|
||||
#undef SCRYPT_HAVE_ROMIX
|
||||
#define SCRYPT_HAVE_ROMIX
|
||||
|
||||
#if !defined(SCRYPT_CHUNKMIX_FN)
|
||||
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
|
||||
|
||||
/*
|
||||
Bout = ChunkMix(Bin)
|
||||
|
||||
2*r: number of blocks in the chunk
|
||||
*/
|
||||
static void /* asm_calling_convention */
|
||||
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
|
||||
scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block;
|
||||
uint32_t i, j, blocksPerChunk = r * 2, half = 0;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
block = scrypt_block(Bin, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] = block[i];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] ^= block[i];
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
block = scrypt_block(Bin, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
}
|
||||
SCRYPT_MIX_FN(X);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
block = scrypt_block(Bout, (i / 2) + half);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
block[j] = X[j];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
X = ROMix(X)
|
||||
|
||||
X: chunk to mix
|
||||
Y: scratch chunk
|
||||
N: number of rounds
|
||||
V[N]: array of chunks to randomly index in to
|
||||
2*r: number of blocks in a chunk
|
||||
*/
|
||||
|
||||
static void NOINLINE FASTCALL
|
||||
SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
|
||||
uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
|
||||
scrypt_mix_word_t *block = V;
|
||||
|
||||
SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
|
||||
|
||||
/* 1: X = B */
|
||||
/* implicit */
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
|
||||
for (i = 0; i < N - 1; i++, block += chunkWords) {
|
||||
/* 3: V_i = X */
|
||||
/* 4: X = H(X) */
|
||||
SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
|
||||
}
|
||||
SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
|
||||
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
for (i = 0; i < N; i += 2) {
|
||||
/* 7: j = Integerify(X) % N */
|
||||
j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
|
||||
|
||||
/* 7: j = Integerify(Y) % N */
|
||||
j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
|
||||
}
|
||||
|
||||
/* 10: B' = X */
|
||||
/* implicit */
|
||||
|
||||
SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with hard-coded r = 1
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE FASTCALL
|
||||
scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
|
||||
scrypt_mix_word_t *block = V;
|
||||
|
||||
SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
|
||||
|
||||
/* 1: X = B */
|
||||
/* implicit */
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
|
||||
for (i = 0; i < N - 1; i++, block += chunkWords) {
|
||||
/* 3: V_i = X */
|
||||
/* 4: X = H(X) */
|
||||
#ifdef SCRYPT_CHUNKMIX_1_FN
|
||||
SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block);
|
||||
#else
|
||||
SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
|
||||
#endif
|
||||
}
|
||||
#ifdef SCRYPT_CHUNKMIX_1_FN
|
||||
SCRYPT_CHUNKMIX_1_FN(X, block);
|
||||
#else
|
||||
SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
|
||||
#endif
|
||||
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
for (i = 0; i < N; i += 2) {
|
||||
/* 7: j = Integerify(X) % N */
|
||||
j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
|
||||
SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords));
|
||||
#else
|
||||
SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
|
||||
#endif
|
||||
|
||||
/* 7: j = Integerify(Y) % N */
|
||||
j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
|
||||
SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords));
|
||||
#else
|
||||
SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* 10: B' = X */
|
||||
/* implicit */
|
||||
|
||||
SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
|
||||
}
|
||||
|
||||
#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
|
||||
|
||||
|
||||
#undef SCRYPT_CHUNKMIX_FN
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#undef SCRYPT_MIX_FN
|
||||
#undef SCRYPT_ROMIX_TANGLE_FN
|
||||
#undef SCRYPT_ROMIX_UNTANGLE_FN
|
||||
|
||||
27
algo/scryptjane/scrypt-jane-romix.h
Normal file
27
algo/scryptjane/scrypt-jane-romix.h
Normal file
@@ -0,0 +1,27 @@
|
||||
#if defined(SCRYPT_CHACHA)
|
||||
#include "scrypt-jane-chacha.h"
|
||||
#elif defined(SCRYPT_SALSA)
|
||||
#include "scrypt-jane-salsa.h"
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
#include "scrypt-jane-salsa64.h"
|
||||
#else
|
||||
#define SCRYPT_MIX_BASE "ERROR"
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
|
||||
static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; }
|
||||
#else
|
||||
static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
|
||||
#endif
|
||||
static int scrypt_test_mix() { return 0; }
|
||||
#error must define a mix function!
|
||||
#endif
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX SCRYPT_MIX_BASE
|
||||
#endif
|
||||
109
algo/scryptjane/scrypt-jane-salsa.h
Normal file
109
algo/scryptjane/scrypt-jane-salsa.h
Normal file
@@ -0,0 +1,109 @@
|
||||
#define SCRYPT_MIX_BASE "Salsa20/8"
|
||||
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_salsa-avx.h"
|
||||
#include "scrypt-jane-mix_salsa-sse2.h"
|
||||
#include "scrypt-jane-mix_salsa.h"
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_MIX_FN salsa_core_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN salsa_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static size_t
|
||||
available_implementations() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
size_t flags = 0;
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
flags |= cpu_avx;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
flags |= cpu_sse2;
|
||||
#endif
|
||||
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static int
|
||||
scrypt_test_mix() {
|
||||
static const uint8_t expected[16] = {
|
||||
0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66,
|
||||
};
|
||||
|
||||
int ret = 1;
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_BASIC)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
133
algo/scryptjane/scrypt-jane-salsa64.h
Normal file
133
algo/scryptjane/scrypt-jane-salsa64.h
Normal file
@@ -0,0 +1,133 @@
|
||||
#define SCRYPT_MIX_BASE "Salsa64/8"
|
||||
|
||||
typedef uint64_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U64TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 128
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_salsa64-avx.h"
|
||||
#include "scrypt-jane-mix_salsa64-ssse3.h"
|
||||
#include "scrypt-jane-mix_salsa64-sse2.h"
|
||||
#include "scrypt-jane-mix_salsa64.h"
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN salsa64_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
return scrypt_ROMix_ssse3;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static size_t
|
||||
available_implementations() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
size_t flags = 0;
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
flags |= cpu_avx;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
flags |= cpu_ssse3;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
flags |= cpu_sse2;
|
||||
#endif
|
||||
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
scrypt_test_mix() {
|
||||
static const uint8_t expected[16] = {
|
||||
0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
|
||||
};
|
||||
|
||||
int ret = 1;
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_BASIC)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
266
algo/scryptjane/scrypt-jane-test-vectors.h
Normal file
266
algo/scryptjane/scrypt-jane-test-vectors.h
Normal file
@@ -0,0 +1,266 @@
|
||||
typedef struct scrypt_test_setting_t {
|
||||
const char *pw, *salt;
|
||||
uint8_t Nfactor, rfactor, pfactor;
|
||||
} scrypt_test_setting;
|
||||
|
||||
/*
|
||||
* I'm hardcoding the values of p and r, which means they can't be tested
|
||||
* anymore. A new test case with a different value for N should maybe be added.
|
||||
* - mikaelh
|
||||
*/
|
||||
static const scrypt_test_setting post_settings[] = {
|
||||
{"", "", 3, 0, 0},
|
||||
// {"password", "NaCl", 9, 3, 4},
|
||||
{0}
|
||||
};
|
||||
|
||||
#if defined(SCRYPT_SHA256)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
/* sha256 + salsa20/8, the only 'official' test vectors! */
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x77,0xd6,0x57,0x62,0x38,0x65,0x7b,0x20,0x3b,0x19,0xca,0x42,0xc1,0x8a,0x04,0x97,
|
||||
0xf1,0x6b,0x48,0x44,0xe3,0x07,0x4a,0xe8,0xdf,0xdf,0xfa,0x3f,0xed,0xe2,0x14,0x42,
|
||||
0xfc,0xd0,0x06,0x9d,0xed,0x09,0x48,0xf8,0x32,0x6a,0x75,0x3a,0x0f,0xc8,0x1f,0x17,
|
||||
0xe8,0xd3,0xe0,0xfb,0x2e,0x0d,0x36,0x28,0xcf,0x35,0xe2,0x0c,0x38,0xd1,0x89,0x06},
|
||||
{0xfd,0xba,0xbe,0x1c,0x9d,0x34,0x72,0x00,0x78,0x56,0xe7,0x19,0x0d,0x01,0xe9,0xfe,
|
||||
0x7c,0x6a,0xd7,0xcb,0xc8,0x23,0x78,0x30,0xe7,0x73,0x76,0x63,0x4b,0x37,0x31,0x62,
|
||||
0x2e,0xaf,0x30,0xd9,0x2e,0x22,0xa3,0x88,0x6f,0xf1,0x09,0x27,0x9d,0x98,0x30,0xda,
|
||||
0xc7,0x27,0xaf,0xb9,0x4a,0x83,0xee,0x6d,0x83,0x60,0xcb,0xdf,0xa2,0xcc,0x06,0x40}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xef,0x8f,0x44,0x8f,0xc3,0xef,0x78,0x13,0xb2,0x26,0xa7,0x2a,0x40,0xa1,0x98,0x7f,
|
||||
0xc8,0x7f,0x0d,0x5f,0x40,0x66,0xa2,0x05,0x07,0x4f,0xc7,0xac,0x3b,0x47,0x07,0x0c,
|
||||
0xf5,0x20,0x46,0x76,0x20,0x7b,0xee,0x51,0x6d,0x5f,0xfa,0x9c,0x27,0xac,0xa9,0x36,
|
||||
0x62,0xbd,0xde,0x0b,0xa3,0xc0,0x66,0x84,0xde,0x82,0xd0,0x1a,0xb4,0xd1,0xb5,0xfe},
|
||||
{0xf1,0x94,0xf7,0x5f,0x15,0x12,0x10,0x4d,0x6e,0xfb,0x04,0x8c,0x35,0xc4,0x51,0xb6,
|
||||
0x11,0x04,0xa7,0x9b,0xb0,0x46,0xaf,0x7b,0x47,0x39,0xf0,0xac,0xb2,0x8a,0xfa,0x45,
|
||||
0x09,0x86,0x8f,0x10,0x4b,0xc6,0xee,0x00,0x11,0x38,0x73,0x7a,0x6a,0xd8,0x25,0x67,
|
||||
0x85,0xa4,0x10,0x4e,0xa9,0x2f,0x15,0xfe,0xcf,0x63,0xe1,0xe8,0xcf,0xab,0xe8,0xbd}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xf4,0x87,0x29,0xf4,0xc3,0x31,0x8c,0xe8,0xdf,0xe5,0xd8,0x73,0xff,0xca,0x32,0xcf,
|
||||
0xd8,0xac,0xe7,0xf7,0x15,0xda,0x84,0x41,0x60,0x23,0x26,0x4a,0xc8,0x3e,0xee,0xa6,
|
||||
0xa5,0x6e,0x52,0xd6,0x64,0x55,0x16,0x31,0x3e,0x66,0x7b,0x65,0xd5,0xe2,0xc9,0x95,
|
||||
0x1b,0xf0,0x81,0x40,0xb7,0x2f,0xff,0xa6,0xe6,0x02,0xcc,0x63,0x08,0x4a,0x74,0x31},
|
||||
{0x7a,0xd8,0xad,0x02,0x9c,0xa5,0xf4,0x42,0x6a,0x29,0xd2,0xb5,0x53,0xf1,0x6d,0x1d,
|
||||
0x25,0xc8,0x70,0x48,0x80,0xb9,0xa3,0xf6,0x94,0xf8,0xfa,0xb8,0x52,0x42,0xcd,0x14,
|
||||
0x26,0x46,0x28,0x06,0xc7,0xf6,0x1f,0xa7,0x89,0x6d,0xc5,0xa0,0x36,0xcc,0xde,0xcb,
|
||||
0x73,0x0b,0xa4,0xe2,0xd3,0xd1,0x44,0x06,0x35,0x08,0xe0,0x35,0x5b,0xf8,0xd7,0xe7}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_SHA512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xae,0x54,0xe7,0x74,0xe4,0x51,0x6b,0x0f,0xe1,0xe7,0x28,0x03,0x17,0xe4,0x8c,0xfa,
|
||||
0x2f,0x66,0x55,0x7f,0xdc,0x3b,0x40,0xab,0x47,0x84,0xc9,0x63,0x36,0x07,0x9d,0xe5,
|
||||
0x86,0x43,0x95,0x89,0xb6,0xc0,0x6c,0x72,0x64,0x00,0xc1,0x2a,0xd7,0x69,0x21,0x92,
|
||||
0x8e,0xba,0xa4,0x59,0x9f,0x00,0x14,0x3a,0x7c,0x12,0x58,0x91,0x09,0xa0,0x32,0xfe},
|
||||
{0xc5,0xb3,0xd6,0xea,0x0a,0x4b,0x1e,0xcc,0x40,0x00,0xe5,0x98,0x5c,0xdc,0x06,0x06,
|
||||
0x78,0x34,0x92,0x16,0xcf,0xe4,0x9f,0x03,0x96,0x2d,0x41,0x35,0x00,0x9b,0xff,0x74,
|
||||
0x60,0x19,0x6e,0xe6,0xa6,0x46,0xf7,0x37,0xcb,0xfa,0xd0,0x9f,0x80,0x72,0x2e,0x85,
|
||||
0x13,0x3e,0x1a,0x91,0x90,0x53,0xa1,0x33,0x85,0x51,0xdc,0x62,0x1c,0x0e,0x4d,0x30}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xe2,0x05,0x7c,0x44,0xf9,0x55,0x9f,0x64,0xbe,0xd5,0x7f,0x85,0x69,0xc7,0x8c,0x7f,
|
||||
0x2b,0x91,0xd6,0x9a,0x6c,0xf8,0x57,0x55,0x61,0x25,0x3d,0xee,0xb8,0xd5,0x8c,0xdc,
|
||||
0x2d,0xd5,0x53,0x84,0x8c,0x06,0xaa,0x37,0x77,0xa6,0xf0,0xf1,0x35,0xfe,0xb5,0xcb,
|
||||
0x61,0xd7,0x2c,0x67,0xf3,0x7e,0x8a,0x1b,0x04,0xa3,0xa3,0x43,0xa2,0xb2,0x29,0xf2},
|
||||
{0x82,0xda,0x29,0xb2,0x08,0x27,0xfc,0x78,0x22,0xc4,0xb8,0x7e,0xbc,0x36,0xcf,0xcd,
|
||||
0x17,0x4b,0xa1,0x30,0x16,0x4a,0x25,0x70,0xc7,0xcb,0xe0,0x2b,0x56,0xd3,0x16,0x4e,
|
||||
0x85,0xb6,0x84,0xe7,0x9b,0x7f,0x8b,0xb5,0x94,0x33,0xcf,0x33,0x44,0x65,0xc8,0xa1,
|
||||
0x46,0xf9,0xf5,0xfc,0x74,0x29,0x7e,0xd5,0x46,0xec,0xbd,0x95,0xc1,0x80,0x24,0xe4}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xa6,0xcb,0x77,0x9a,0x64,0x1f,0x95,0x02,0x53,0xe7,0x5c,0x78,0xdb,0xa3,0x43,0xff,
|
||||
0xbe,0x10,0x4c,0x7b,0xe4,0xe1,0x91,0xcf,0x67,0x69,0x5a,0x2c,0x12,0xd6,0x99,0x49,
|
||||
0x92,0xfd,0x5a,0xaa,0x12,0x4c,0x2e,0xf6,0x95,0x46,0x8f,0x5e,0x77,0x62,0x16,0x29,
|
||||
0xdb,0xe7,0xab,0x02,0x2b,0x9c,0x35,0x03,0xf8,0xd4,0x04,0x7d,0x2d,0x73,0x85,0xf1},
|
||||
{0x54,0xb7,0xca,0xbb,0xaf,0x0f,0xb0,0x5f,0xb7,0x10,0x63,0x48,0xb3,0x15,0xd8,0xb5,
|
||||
0x62,0x64,0x89,0x6a,0x59,0xc6,0x0f,0x86,0x96,0x38,0xf0,0xcf,0xd4,0x62,0x90,0x61,
|
||||
0x7d,0xce,0xd6,0x13,0x85,0x67,0x4a,0xf5,0x32,0x03,0x74,0x30,0x0b,0x5a,0x2f,0x86,
|
||||
0x82,0x6e,0x0c,0x3e,0x40,0x7a,0xde,0xbe,0x42,0x6e,0x80,0x2b,0xaf,0xdb,0xcc,0x94}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_BLAKE512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x4a,0x48,0xb3,0xfa,0xdc,0xb0,0xb8,0xdb,0x54,0xee,0xf3,0x5c,0x27,0x65,0x6c,0x20,
|
||||
0xab,0x61,0x9a,0x5b,0xd5,0x1d,0xd9,0x95,0xab,0x88,0x0e,0x4d,0x1e,0x71,0x2f,0x11,
|
||||
0x43,0x2e,0xef,0x23,0xca,0x8a,0x49,0x3b,0x11,0x38,0xa5,0x28,0x61,0x2f,0xb7,0x89,
|
||||
0x5d,0xef,0x42,0x4c,0xc1,0x74,0xea,0x8a,0x56,0xbe,0x4a,0x82,0x76,0x15,0x1a,0x87},
|
||||
{0x96,0x24,0xbf,0x40,0xeb,0x03,0x8e,0xfe,0xc0,0xd5,0xa4,0x81,0x85,0x7b,0x09,0x88,
|
||||
0x52,0xb5,0xcb,0xc4,0x48,0xe1,0xb9,0x1d,0x3f,0x8b,0x3a,0xc6,0x38,0x32,0xc7,0x55,
|
||||
0x30,0x28,0x7a,0x42,0xa9,0x5d,0x54,0x33,0x62,0xf3,0xd9,0x3c,0x96,0x40,0xd1,0x80,
|
||||
0xe4,0x0e,0x7e,0xf0,0x64,0x53,0xfe,0x7b,0xd7,0x15,0xba,0xad,0x16,0x80,0x01,0xb5}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x45,0x42,0x22,0x31,0x26,0x13,0x5f,0x94,0xa4,0x00,0x04,0x47,0xe8,0x50,0x6d,0xd6,
|
||||
0xdd,0xd5,0x08,0xd4,0x90,0x64,0xe0,0x59,0x70,0x46,0xff,0xfc,0x29,0xb3,0x6a,0xc9,
|
||||
0x4d,0x45,0x97,0x95,0xa8,0xf0,0x53,0xe7,0xee,0x4b,0x6b,0x5d,0x1e,0xa5,0xb2,0x58,
|
||||
0x4b,0x93,0xc9,0x89,0x4c,0xa8,0xab,0x03,0x74,0x38,0xbd,0x54,0x97,0x6b,0xab,0x4a},
|
||||
{0x4b,0x4a,0x63,0x96,0x73,0x34,0x9f,0x39,0x64,0x51,0x0e,0x2e,0x3b,0x07,0xd5,0x1c,
|
||||
0xd2,0xf7,0xce,0x60,0xab,0xac,0x89,0xa4,0x16,0x0c,0x58,0x82,0xb3,0xd3,0x25,0x5b,
|
||||
0xd5,0x62,0x32,0xf4,0x86,0x5d,0xb2,0x4b,0xbf,0x8e,0xc6,0xc0,0xac,0x40,0x48,0xb4,
|
||||
0x69,0x08,0xba,0x40,0x4b,0x07,0x2a,0x13,0x9c,0x98,0x3b,0x8b,0x20,0x0c,0xac,0x9e}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xcb,0x4b,0xc2,0xd1,0xf4,0x77,0x32,0x3c,0x42,0x9d,0xf7,0x7d,0x1f,0x22,0x64,0xa4,
|
||||
0xe2,0x88,0x30,0x2d,0x54,0x9d,0xb6,0x26,0x89,0x25,0x30,0xc3,0x3d,0xdb,0xba,0x99,
|
||||
0xe9,0x8e,0x1e,0x5e,0x57,0x66,0x75,0x7c,0x24,0xda,0x00,0x6f,0x79,0xf7,0x47,0xf5,
|
||||
0xea,0x40,0x70,0x37,0xd2,0x91,0xc7,0x4d,0xdf,0x46,0xb6,0x3e,0x95,0x7d,0xcb,0xc1},
|
||||
{0x25,0xc2,0xcb,0x7f,0xc8,0x50,0xb7,0x0b,0x11,0x9e,0x1d,0x10,0xb2,0xa8,0x35,0x23,
|
||||
0x91,0x39,0xfb,0x45,0xf2,0xbf,0xe4,0xd0,0x84,0xec,0x72,0x33,0x6d,0x09,0xed,0x41,
|
||||
0x9a,0x7e,0x4f,0x10,0x73,0x97,0x22,0x76,0x58,0x93,0x39,0x24,0xdf,0xd2,0xaa,0x2f,
|
||||
0x6b,0x2b,0x64,0x48,0xa5,0xb7,0xf5,0x56,0x77,0x02,0xa7,0x71,0x46,0xe5,0x0e,0x8d},
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_BLAKE256)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xf1,0xf1,0x91,0x1a,0x81,0xe6,0x9f,0xc1,0xce,0x43,0xab,0xb1,0x1a,0x02,0x1e,0x16,
|
||||
0x08,0xc6,0xf9,0x00,0x50,0x1b,0x6d,0xf1,0x31,0x06,0x95,0x48,0x5d,0xf7,0x6c,0x00,
|
||||
0xa2,0x4c,0xb1,0x0e,0x52,0x66,0x94,0x7e,0x84,0xfc,0xa5,0x34,0xfd,0xf0,0xe9,0x57,
|
||||
0x85,0x2d,0x8c,0x05,0x5c,0x0f,0x04,0xd4,0x8d,0x3e,0x13,0x52,0x3d,0x90,0x2d,0x2c},
|
||||
{0xd5,0x42,0xd2,0x7b,0x06,0xae,0x63,0x90,0x9e,0x30,0x00,0x0e,0xd8,0xa4,0x3a,0x0b,
|
||||
0xee,0x4a,0xef,0xb2,0xc4,0x95,0x0d,0x72,0x07,0x70,0xcc,0xa3,0xf9,0x1e,0xc2,0x75,
|
||||
0xcf,0xaf,0xe1,0x44,0x1c,0x8c,0xe2,0x3e,0x0c,0x81,0xf3,0x92,0xe1,0x13,0xe6,0x4f,
|
||||
0x2d,0x27,0xc3,0x87,0xe5,0xb6,0xf9,0xd7,0x02,0x04,0x37,0x64,0x78,0x36,0x6e,0xb3}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xad,0x1b,0x4b,0xca,0xe3,0x26,0x1a,0xfd,0xb7,0x77,0x8c,0xde,0x8d,0x26,0x14,0xe1,
|
||||
0x54,0x38,0x42,0xf3,0xb3,0x66,0x29,0xf9,0x90,0x04,0xf1,0x82,0x7c,0x5a,0x6f,0xa8,
|
||||
0x7d,0xd6,0x08,0x0d,0x8b,0x78,0x04,0xad,0x31,0xea,0xd4,0x87,0x2d,0xf7,0x74,0x9a,
|
||||
0xe5,0xce,0x97,0xef,0xa3,0xbb,0x90,0x46,0x7c,0xf4,0x51,0x38,0xc7,0x60,0x53,0x21},
|
||||
{0x39,0xbb,0x56,0x3d,0x0d,0x7b,0x74,0x82,0xfe,0x5a,0x78,0x3d,0x66,0xe8,0x3a,0xdf,
|
||||
0x51,0x6f,0x3e,0xf4,0x86,0x20,0x8d,0xe1,0x81,0x22,0x02,0xf7,0x0d,0xb5,0x1a,0x0f,
|
||||
0xfc,0x59,0xb6,0x60,0xc9,0xdb,0x38,0x0b,0x5b,0x95,0xa5,0x94,0xda,0x42,0x2d,0x90,
|
||||
0x47,0xeb,0x73,0x31,0x9f,0x20,0xf6,0x81,0xc2,0xef,0x33,0x77,0x51,0xd8,0x2c,0xe4}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x9e,0xf2,0x60,0x7c,0xbd,0x7c,0x19,0x5c,0x79,0xc6,0x1b,0x7e,0xb0,0x65,0x1b,0xc3,
|
||||
0x70,0x0d,0x89,0xfc,0x72,0xb2,0x03,0x72,0x15,0xcb,0x8e,0x8c,0x49,0x50,0x4c,0x27,
|
||||
0x99,0xda,0x47,0x32,0x5e,0xb4,0xa2,0x07,0x83,0x51,0x6b,0x06,0x37,0x60,0x42,0xc4,
|
||||
0x59,0x49,0x99,0xdd,0xc0,0xd2,0x08,0x94,0x7f,0xe3,0x9e,0x4e,0x43,0x8e,0x5b,0xba},
|
||||
{0x86,0x6f,0x3b,0x11,0xb8,0xca,0x4b,0x6e,0xa7,0x6f,0xc2,0xc9,0x33,0xb7,0x8b,0x9f,
|
||||
0xa3,0xb9,0xf5,0xb5,0x62,0xa6,0x17,0x66,0xe4,0xc3,0x9d,0x9b,0xca,0x51,0xb0,0x2f,
|
||||
0xda,0x09,0xc1,0x77,0xed,0x8b,0x89,0xc2,0x69,0x5a,0x34,0x05,0x4a,0x1f,0x4d,0x76,
|
||||
0xcb,0xd5,0xa4,0x78,0xfa,0x1b,0xb9,0x5b,0xbc,0x3d,0xce,0x04,0x63,0x99,0xad,0x54}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_SKEIN512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69,
|
||||
0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87,
|
||||
0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f,
|
||||
0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e},
|
||||
{0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e,
|
||||
0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b,
|
||||
0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb,
|
||||
0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xd1,0x12,0x6d,0x64,0x10,0x0e,0x98,0x6c,0xbe,0x70,0x21,0xd9,0xc6,0x04,0x62,0xa4,
|
||||
0x29,0x13,0x9a,0x3c,0xf8,0xe9,0x1e,0x87,0x9f,0x88,0xf4,0x98,0x01,0x41,0x8e,0xce,
|
||||
0x60,0xf7,0xbe,0x17,0x0a,0xec,0xd6,0x30,0x80,0xcf,0x6b,0x1e,0xcf,0x95,0xa0,0x4d,
|
||||
0x37,0xed,0x3a,0x09,0xd1,0xeb,0x0c,0x80,0x82,0x22,0x8e,0xd3,0xb1,0x7f,0xd6,0xa8},
|
||||
{0x5c,0x5c,0x05,0xe2,0x75,0xa5,0xa4,0xec,0x81,0x97,0x9c,0x5b,0xd7,0x26,0xb3,0x16,
|
||||
0xb4,0x02,0x8c,0x56,0xe6,0x32,0x57,0x33,0x47,0x19,0x06,0x6c,0xde,0x68,0x41,0x37,
|
||||
0x5b,0x7d,0xa7,0xb3,0x73,0xeb,0x82,0xca,0x0f,0x86,0x2e,0x6b,0x47,0xa2,0x70,0x39,
|
||||
0x35,0xfd,0x2d,0x2e,0x7b,0xc3,0x68,0xbb,0x52,0x42,0x19,0x3b,0x78,0x96,0xe7,0xc8}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
|
||||
0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
|
||||
0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
|
||||
0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
|
||||
{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
|
||||
0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
|
||||
0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
|
||||
0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_KECCAK512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xc2,0x7b,0xbe,0x1d,0xf1,0x99,0xd8,0xe7,0x1b,0xac,0xe0,0x9d,0xeb,0x5a,0xfe,0x21,
|
||||
0x71,0xff,0x41,0x51,0x4f,0xbe,0x41,0x01,0x15,0xe2,0xb7,0xb9,0x55,0x15,0x25,0xa1,
|
||||
0x40,0x4c,0x66,0x29,0x32,0xb7,0xc9,0x62,0x60,0x88,0xe0,0x99,0x39,0xae,0xce,0x25,
|
||||
0x3c,0x11,0x89,0xdd,0xc6,0x14,0xd7,0x3e,0xa3,0x6d,0x07,0x2e,0x56,0xa0,0xff,0x97},
|
||||
{0x3c,0x91,0x12,0x4a,0x37,0x7d,0xd6,0x96,0xd2,0x9b,0x5d,0xea,0xb8,0xb9,0x82,0x4e,
|
||||
0x4f,0x6b,0x60,0x4c,0x59,0x01,0xe5,0x73,0xfd,0xf6,0xb8,0x9a,0x5a,0xd3,0x7c,0x7a,
|
||||
0xd2,0x4f,0x8e,0x74,0xc1,0x90,0x88,0xa0,0x3f,0x55,0x75,0x79,0x10,0xd0,0x09,0x79,
|
||||
0x0f,0x6c,0x74,0x0c,0x05,0x08,0x3c,0x8c,0x94,0x7b,0x30,0x56,0xca,0xdf,0xdf,0x34}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x77,0xcb,0x70,0xbf,0xae,0xd4,0x4c,0x5b,0xbc,0xd3,0xec,0x8a,0x82,0x43,0x8d,0xb3,
|
||||
0x7f,0x1f,0xfb,0x70,0x36,0x32,0x4d,0xa6,0xb7,0x13,0x37,0x77,0x30,0x0c,0x3c,0xfb,
|
||||
0x2c,0x20,0x8f,0x2a,0xf4,0x47,0x4d,0x69,0x8e,0xae,0x2d,0xad,0xba,0x35,0xe9,0x2f,
|
||||
0xe6,0x99,0x7a,0xf8,0xcf,0x70,0x78,0xbb,0x0c,0x72,0x64,0x95,0x8b,0x36,0x77,0x3d},
|
||||
{0xc6,0x43,0x17,0x16,0x87,0x09,0x5f,0x12,0xed,0x21,0xe2,0xb4,0xad,0x55,0xa1,0xa1,
|
||||
0x49,0x50,0x90,0x70,0xab,0x81,0x83,0x7a,0xcd,0xdf,0x23,0x52,0x19,0xc0,0xa2,0xd8,
|
||||
0x8e,0x98,0xeb,0xf0,0x37,0xab,0xad,0xfd,0x1c,0x04,0x97,0x18,0x42,0x85,0xf7,0x4b,
|
||||
0x18,0x2c,0x55,0xd3,0xa9,0xe6,0x89,0xfb,0x58,0x0a,0xb2,0x37,0xb9,0xf8,0xfb,0xc5}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xc7,0x34,0x95,0x02,0x5e,0x31,0x0d,0x1f,0x10,0x38,0x9c,0x3f,0x04,0x53,0xed,0x05,
|
||||
0x27,0x38,0xc1,0x3f,0x6a,0x0f,0xc5,0xa3,0x9b,0x73,0x8a,0x28,0x7e,0x5d,0x3c,0xdc,
|
||||
0x9d,0x5a,0x09,0xbf,0x8c,0x0a,0xad,0xe4,0x73,0x52,0xe3,0x6d,0xaa,0xd1,0x8b,0xbf,
|
||||
0xa3,0xb7,0xf0,0x58,0xad,0x22,0x24,0xc9,0xaa,0x96,0xb7,0x5d,0xfc,0x5f,0xb0,0xcf},
|
||||
{0x76,0x22,0xfd,0xe8,0xa2,0x79,0x8e,0x9d,0x43,0x8c,0x7a,0xba,0x78,0xb7,0x84,0xf1,
|
||||
0xc8,0xee,0x3b,0xae,0x31,0x89,0xbf,0x7e,0xd0,0x4b,0xc1,0x2d,0x58,0x5d,0x84,0x6b,
|
||||
0xec,0x86,0x56,0xe0,0x87,0x94,0x7f,0xbc,0xf9,0x48,0x92,0xef,0x54,0x7f,0x23,0x8d,
|
||||
0x4f,0x8b,0x0a,0x75,0xa7,0x39,0x0e,0x46,0x6e,0xee,0x58,0xc8,0xfa,0xea,0x90,0x53}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_KECCAK256)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x2e,0x96,0xd8,0x87,0x45,0xcd,0xd6,0xc8,0xf6,0xd2,0x87,0x33,0x50,0xc7,0x04,0xe5,
|
||||
0x3c,0x4b,0x48,0x44,0x57,0xc1,0x74,0x09,0x76,0x02,0xaa,0xd3,0x7b,0xf3,0xbf,0xed,
|
||||
0x4b,0x72,0xd7,0x1b,0x49,0x6b,0xe0,0x44,0x83,0xee,0x8f,0xaf,0xa1,0xb5,0x33,0xa9,
|
||||
0x9e,0x86,0xab,0xe2,0x9f,0xcf,0x68,0x6e,0x7e,0xbd,0xf5,0x7a,0x83,0x4b,0x1c,0x10},
|
||||
{0x42,0x7e,0xf9,0x4b,0x72,0x61,0xda,0x2d,0xb3,0x27,0x0e,0xe1,0xd9,0xde,0x5f,0x3e,
|
||||
0x64,0x2f,0xd6,0xda,0x90,0x59,0xce,0xbf,0x02,0x5b,0x32,0xf7,0x6d,0x94,0x51,0x7b,
|
||||
0xb6,0xa6,0x0d,0x99,0x3e,0x7f,0x39,0xbe,0x1b,0x1d,0x6c,0x97,0x12,0xd8,0xb7,0xfd,
|
||||
0x5b,0xb5,0xf3,0x73,0x5a,0x89,0xb2,0xdd,0xcc,0x3d,0x74,0x2e,0x3d,0x9e,0x3c,0x22}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x76,0x1d,0x5b,0x8f,0xa9,0xe1,0xa6,0x01,0xcb,0xc5,0x7a,0x5f,0x02,0x23,0xb6,0x82,
|
||||
0x57,0x79,0x60,0x2f,0x05,0x7f,0xb8,0x0a,0xcb,0x5e,0x54,0x11,0x49,0x2e,0xdd,0x85,
|
||||
0x83,0x30,0x67,0xb3,0x24,0x5c,0xce,0xfc,0x32,0xcf,0x12,0xc3,0xff,0xe0,0x79,0x36,
|
||||
0x74,0x17,0xa6,0x3e,0xcd,0xa0,0x7e,0xcb,0x37,0xeb,0xcb,0xb6,0xe1,0xb9,0xf5,0x15},
|
||||
{0xf5,0x66,0xa7,0x4c,0xe4,0xdc,0x18,0x56,0x2f,0x3e,0x86,0x4d,0x92,0xa5,0x5c,0x5a,
|
||||
0x8f,0xc3,0x6b,0x32,0xdb,0xe5,0x72,0x50,0x84,0xfc,0x6e,0x5d,0x15,0x77,0x3d,0xca,
|
||||
0xc5,0x2b,0x20,0x3c,0x78,0x37,0x80,0x78,0x23,0x56,0x91,0xa0,0xce,0xa4,0x06,0x5a,
|
||||
0x7f,0xe3,0xbf,0xab,0x51,0x57,0x32,0x2c,0x0a,0xf0,0xc5,0x6f,0xf4,0xcb,0xff,0x42}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xb0,0xb7,0x10,0xb5,0x1f,0x2b,0x7f,0xaf,0x9d,0x95,0x5f,0x4c,0x2d,0x98,0x7c,0xc1,
|
||||
0xbc,0x37,0x2f,0x50,0x8d,0xb2,0x9f,0xfd,0x48,0x0d,0xe0,0x44,0x19,0xdf,0x28,0x6c,
|
||||
0xab,0xbf,0x1e,0x17,0x26,0xcc,0x57,0x95,0x18,0x17,0x83,0x4c,0x12,0x48,0xd9,0xee,
|
||||
0x4b,0x00,0x29,0x06,0x31,0x01,0x6b,0x8c,0x26,0x39,0xbf,0xe4,0xe4,0xd4,0x6a,0x26},
|
||||
{0xa0,0x40,0xb2,0xf2,0x11,0xb6,0x5f,0x3d,0x4c,0x1e,0xef,0x59,0xd4,0x98,0xdb,0x14,
|
||||
0x01,0xff,0xe3,0x34,0xd7,0x19,0xcd,0xeb,0xde,0x52,0x1c,0xf4,0x86,0x43,0xc9,0xe2,
|
||||
0xfb,0xf9,0x4f,0x0a,0xbb,0x1f,0x5c,0x6a,0xdf,0xb9,0x28,0xfa,0xac,0xc4,0x48,0xed,
|
||||
0xcc,0xd2,0x2e,0x25,0x5f,0xf3,0x56,0x1d,0x2d,0x23,0x22,0xc1,0xbc,0xff,0x78,0x80}
|
||||
};
|
||||
#endif
|
||||
#else
|
||||
static const uint8_t post_vectors[][64] = {{0}};
|
||||
#endif
|
||||
|
||||
251
algo/scryptjane/scrypt-jane.c
Normal file
251
algo/scryptjane/scrypt-jane.c
Normal file
@@ -0,0 +1,251 @@
|
||||
#include "miner.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "inttypes.h"
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
/* Hard-coded scrypt parameteres r and p - mikaelh */
|
||||
#define SCRYPT_R 1
|
||||
#define SCRYPT_P 1
|
||||
|
||||
/* Only the instrinsics versions are optimized for hard-coded values - mikaelh */
|
||||
#define CPU_X86_FORCE_INTRINSICS
|
||||
|
||||
#undef SCRYPT_KECCAK512
|
||||
#undef SCRYPT_CHACHA
|
||||
#undef SCRYPT_CHOOSE_COMPILETIME
|
||||
#define SCRYPT_KECCAK512
|
||||
#define SCRYPT_CHACHA
|
||||
#define SCRYPT_CHOOSE_COMPILETIME
|
||||
|
||||
//#include "scrypt-jane.h"
|
||||
#include "../scryptjane/scrypt-jane-portable.h"
|
||||
#include "../scryptjane/scrypt-jane-hash.h"
|
||||
#include "../scryptjane/scrypt-jane-romix.h"
|
||||
#include "../scryptjane/scrypt-jane-test-vectors.h"
|
||||
|
||||
#ifndef min
|
||||
#define min(a,b) (a>b ? b : a)
|
||||
#endif
|
||||
#ifndef max
|
||||
#define max(a,b) (a<b ? b : a)
|
||||
#endif
|
||||
|
||||
#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */
|
||||
#if (SCRYPT_BLOCK_BYTES == 64)
|
||||
#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 128)
|
||||
#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 256)
|
||||
#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 512)
|
||||
#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
|
||||
#endif
|
||||
#define scrypt_maxr scrypt_r_32kb /* 32kb */
|
||||
#define scrypt_maxp 25 /* (1 << 25) = ~33 million */
|
||||
|
||||
typedef struct scrypt_aligned_alloc_t {
|
||||
uint8_t *mem, *ptr;
|
||||
} scrypt_aligned_alloc;
|
||||
|
||||
static int
|
||||
scrypt_alloc(uint64_t size, scrypt_aligned_alloc *aa) {
|
||||
static const size_t max_alloc = (size_t)-1;
|
||||
size += (SCRYPT_BLOCK_BYTES - 1);
|
||||
if (size > max_alloc)
|
||||
return 0; // scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
|
||||
aa->mem = (uint8_t *)malloc((size_t)size);
|
||||
aa->ptr = (uint8_t *)(((size_t)aa->mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
|
||||
if (!aa->mem)
|
||||
return 0; // scrypt_fatal_error("scrypt: out of memory");
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_free(scrypt_aligned_alloc *aa) {
|
||||
free(aa->mem);
|
||||
}
|
||||
|
||||
void
|
||||
scrypt_N_1_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) {
|
||||
uint32_t chunk_bytes, i;
|
||||
const uint32_t r = SCRYPT_R;
|
||||
const uint32_t p = SCRYPT_P;
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
|
||||
#endif
|
||||
|
||||
chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
|
||||
|
||||
/* 1: X = PBKDF2(password, salt) */
|
||||
scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, chunk_bytes * p);
|
||||
|
||||
/* 2: X = ROMix(X) */
|
||||
for (i = 0; i < p; i++)
|
||||
scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N);
|
||||
|
||||
/* 3: Out = PBKDF2(password, X) */
|
||||
scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes);
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
/* This is an unnecessary security feature - mikaelh */
|
||||
scrypt_ensure_zero(Y, (p + 1) * chunk_bytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// increasing Nfactor gradually
|
||||
const unsigned char minNfactor = 4;
|
||||
const unsigned char maxNfactor = 30;
|
||||
|
||||
unsigned char GetNfactor(unsigned int nTimestamp, unsigned int ntime) {
|
||||
int l = 0;
|
||||
unsigned long int s;
|
||||
int n;
|
||||
unsigned char N;
|
||||
|
||||
if (nTimestamp <= ntime)
|
||||
return 4;
|
||||
|
||||
s = nTimestamp - ntime;
|
||||
while ((s >> 1) > 3) {
|
||||
l += 1;
|
||||
s >>= 1;
|
||||
}
|
||||
|
||||
s &= 3;
|
||||
|
||||
n = (l * 170 + s * 25 - 2320) / 100;
|
||||
|
||||
if (n < 0) n = 0;
|
||||
|
||||
if (n > 255) {
|
||||
n = 255;
|
||||
// printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n);
|
||||
}
|
||||
|
||||
N = (unsigned char)n;
|
||||
//printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfactor), maxNfactor));
|
||||
|
||||
if (N<minNfactor) return minNfactor;
|
||||
if (N>maxNfactor) return maxNfactor;
|
||||
return N;
|
||||
}
|
||||
|
||||
|
||||
int scanhash_scryptjane( int thr_id, struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done)
|
||||
{
|
||||
scrypt_aligned_alloc YX, V;
|
||||
uint8_t *X, *Y;
|
||||
uint32_t N, chunk_bytes;
|
||||
const uint32_t r = SCRYPT_R;
|
||||
const uint32_t p = SCRYPT_P;
|
||||
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0x00ff;
|
||||
|
||||
for (int k = 0; k < 19; k++)
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
|
||||
//Nfactor = GetNfactor(data[17], ntime);
|
||||
//if (Nfactor > scrypt_maxN) {
|
||||
// return 1;
|
||||
// //scrypt_fatal_error("scrypt: N out of range");
|
||||
//}
|
||||
|
||||
N = (1 << ( opt_scrypt_n + 1));
|
||||
|
||||
chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
|
||||
if (!scrypt_alloc((uint64_t)N * chunk_bytes, &V)) return 1;
|
||||
if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) {
|
||||
scrypt_free(&V);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Y = YX.ptr;
|
||||
X = Y + chunk_bytes;
|
||||
|
||||
do {
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t hash[8];
|
||||
be32enc(&endiandata[19], nonce);
|
||||
|
||||
scrypt_N_1_1((unsigned char *)endiandata, 80,
|
||||
(unsigned char *)endiandata, 80,
|
||||
N, (unsigned char *)hash, 32, X, Y, V.ptr);
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* simple cpu test (util.c) */
|
||||
void scryptjanehash(void *output, const void *input )
|
||||
{
|
||||
scrypt_aligned_alloc YX, V;
|
||||
uint8_t *X, *Y;
|
||||
uint32_t chunk_bytes;
|
||||
uint32_t N = (1 << ( opt_scrypt_n + 1));
|
||||
const uint32_t r = SCRYPT_R;
|
||||
const uint32_t p = SCRYPT_P;
|
||||
|
||||
memset(output, 0, 32);
|
||||
|
||||
chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
|
||||
if (!scrypt_alloc((uint64_t)N * chunk_bytes, &V)) return;
|
||||
if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) {
|
||||
scrypt_free(&V);
|
||||
return;
|
||||
}
|
||||
|
||||
Y = YX.ptr;
|
||||
X = Y + chunk_bytes;
|
||||
|
||||
scrypt_N_1_1((unsigned char*)input, 80, (unsigned char*)input, 80,
|
||||
N, (unsigned char*)output, 32, X, Y, V.ptr);
|
||||
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
}
|
||||
|
||||
void scryptjane_set_target( struct work* work, double job_diff )
|
||||
{
|
||||
work_set_target( work, job_diff / (65536.0 * opt_diff_factor) );
|
||||
}
|
||||
|
||||
bool register_scryptjane_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_scryptjane;
|
||||
gate->hash = (void*)&scryptjanehash;
|
||||
gate->hash_alt = (void*)&scryptjanehash;
|
||||
gate->set_target = (void*)&scrypt_set_target;
|
||||
gate->get_max64 = (void*)&get_max64_0x40LL;
|
||||
if ( opt_nfactor == 0 )
|
||||
opt_nfactor = 16;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user