mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.10.1
This commit is contained in:
@@ -144,6 +144,9 @@ Supported Algorithms
|
||||
Errata
|
||||
------
|
||||
|
||||
Old algorithms that are no longer used frequently will not have the latest
|
||||
optimizations.
|
||||
|
||||
Cryptonight and variants are no longer supported, use another miner.
|
||||
|
||||
Neoscrypt crashes on Windows, use legacy version.
|
||||
|
@@ -15,8 +15,8 @@ the features listed at cpuminer startup to ensure you are mining at
|
||||
optimum speed using the best available features.
|
||||
|
||||
Architecture names and compile options used are only provided for Intel
|
||||
Core series. Even the newest Pentium and Celeron CPUs are often missing
|
||||
features.
|
||||
Core series. Budget CPUs like Pentium and Celeron are often missing the
|
||||
latest features.
|
||||
|
||||
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
|
||||
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
@@ -28,7 +28,7 @@ Exe name Compile flags Arch name
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
||||
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
|
||||
|
||||
|
@@ -31,9 +31,20 @@ FreeBSD YMMV.
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.10.1
|
||||
|
||||
AVX512 for blake2b, nist5, quark, tribus.
|
||||
|
||||
More broken lane fixes.
|
||||
|
||||
Fixed buffer overflow in skein AVX512.
|
||||
|
||||
Only the highest ranking feature in a class is listed at startup, lower ranking
|
||||
features are available but no longer listed.
|
||||
|
||||
v3.10.0
|
||||
|
||||
AVX-512 is now supported on selected algos, Windows binary is now available.
|
||||
AVX512 is now supported on selected algos, Windows binary is now available.
|
||||
AVX512 optimizations are available for argon2d, blake2s, keccak, keccakc,
|
||||
skein & skein2.
|
||||
|
||||
@@ -45,7 +56,7 @@ Fixed some previously undetected buffer overflows.
|
||||
|
||||
Lyra2rev2 3% faster SSE2 and AVX2.
|
||||
|
||||
Added "-fno-asynchronous-unwind-tables" to AVX512 build acript for Windows
|
||||
Added "-fno-asynchronous-unwind-tables" to AVX512 build script for Windows
|
||||
to fix known mingw issue.
|
||||
|
||||
Changed AVX2 build script to explicitly add AES to address change in
|
||||
|
@@ -21,7 +21,7 @@
|
||||
|
||||
#include "argon2.h"
|
||||
#include "core.h"
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "../blake2/blake2.h"
|
||||
#include "../blake2/blamka-round-opt.h"
|
||||
|
||||
@@ -37,24 +37,28 @@
|
||||
|
||||
#if defined(__AVX512F__)
|
||||
|
||||
static void fill_block(__m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor) {
|
||||
static void fill_block( __m512i *state, const block *ref_block,
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
__m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
|
||||
unsigned int i;
|
||||
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
|
||||
block_XY[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm512_xor_si512(
|
||||
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
|
||||
if ( with_xor )
|
||||
{
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
|
||||
block_XY[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)next_block->v + i ) );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
block_XY[i] = state[i] = _mm512_xor_si512( state[i],
|
||||
_mm512_load_si512( (const __m512i*)ref_block->v + i ) );
|
||||
}
|
||||
|
||||
BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
|
||||
state[ 4], state[ 5], state[ 6], state[ 7] );
|
||||
@@ -66,23 +70,10 @@ static void fill_block(__m512i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
|
||||
state[ 9], state[11], state[13], state[15] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 2; ++i) {
|
||||
BLAKE2_ROUND_1(
|
||||
state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
|
||||
state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 2; ++i) {
|
||||
BLAKE2_ROUND_2(
|
||||
state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
|
||||
state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm512_xor_si512(state[i], block_XY[i]);
|
||||
_mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
|
||||
for ( i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm512_xor_si512( state[i], block_XY[i] );
|
||||
_mm512_store_si512( (__m512i*)next_block->v + i, state[i] );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,18 +116,6 @@ static void fill_block(__m256i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
|
||||
state[19], state[23], state[27], state[31] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
|
||||
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
|
||||
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
|
||||
}
|
||||
*/
|
||||
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
|
||||
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
|
||||
@@ -153,14 +132,14 @@ static void fill_block(__m128i *state, const block *ref_block,
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
|
||||
block_XY[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)next_block->v + i));
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm_xor_si128(
|
||||
state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
|
||||
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,22 +177,9 @@ static void fill_block(__m128i *state, const block *ref_block,
|
||||
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
|
||||
state[39], state[47], state[55], state[63] );
|
||||
|
||||
/*
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
|
||||
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
|
||||
state[8 * i + 6], state[8 * i + 7]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
|
||||
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
|
||||
state[8 * 6 + i], state[8 * 7 + i]);
|
||||
}
|
||||
*/
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(state[i], block_XY[i]);
|
||||
_mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
|
||||
_mm_store_si128((__m128i *)next_block->v + i, state[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -427,14 +427,14 @@ static __m512i muladd(__m512i x, __m512i y)
|
||||
#define SWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
||||
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
} while((void)0, 0)
|
||||
|
||||
#define UNSWAP_QUARTERS(A0, A1) \
|
||||
do { \
|
||||
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
||||
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
||||
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
|
||||
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
|
||||
SWAP_HALVES(A0, A1); \
|
||||
} while((void)0, 0)
|
||||
|
||||
|
@@ -118,20 +118,42 @@ void blake256r8_8way_close(void *cc, void *dst);
|
||||
// Blake-512 4 way
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i buf[16];
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_4way_big_context;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init(void *cc);
|
||||
void blake512_4way(void *cc, const void *data, size_t len);
|
||||
void blake512_4way_close(void *cc, void *dst);
|
||||
void blake512_4way_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
void blake512_4way_init( void *cc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
#define blake512_4way blake512_4way_update
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( void *cc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
|
@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
|
@@ -54,7 +54,6 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
|
||||
// Blake-512
|
||||
|
||||
static const sph_u64 IV512[8] = {
|
||||
@@ -64,6 +63,7 @@ static const sph_u64 IV512[8] = {
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
};
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
|
||||
|
||||
@@ -264,8 +264,6 @@ static const unsigned sigma[16][16] = {
|
||||
#define Mx_(n) Mx__(n)
|
||||
#define Mx__(n) M ## n
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define CBx(r, i) CBx_(Z ## r ## i)
|
||||
#define CBx_(n) CBx__(n)
|
||||
#define CBx__(n) CB ## n
|
||||
@@ -287,6 +285,7 @@ static const unsigned sigma[16][16] = {
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
/*
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
static const sph_u64 CB[16] = {
|
||||
@@ -301,7 +300,301 @@ static const sph_u64 CB[16] = {
|
||||
};
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
#define READ_STATE64(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE64(state) do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake-512 8 way
|
||||
|
||||
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c1 ), m0 ), b ), a ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 25 ); \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
_mm512_set1_epi64( c0 ), m1 ), b ), a ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 11 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_B_8WAY(r) do { \
|
||||
GB_8WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB_8WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
GB_8WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
|
||||
GB_8WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
|
||||
GB_8WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
|
||||
GB_8WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
|
||||
GB_8WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
|
||||
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define DECL_STATE64_8WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m512i S0, S1, S2, S3; \
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define COMPRESS64_8WAY do \
|
||||
{ \
|
||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
__m512i shuf_bswap64; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm512_xor_si512( S0, m512_const1_64( CB0 ) ); \
|
||||
V9 = _mm512_xor_si512( S1, m512_const1_64( CB1 ) ); \
|
||||
VA = _mm512_xor_si512( S2, m512_const1_64( CB2 ) ); \
|
||||
VB = _mm512_xor_si512( S3, m512_const1_64( CB3 ) ); \
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
m512_const1_64( CB4 ) ); \
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi64( T0 ), \
|
||||
m512_const1_64( CB5 ) ); \
|
||||
VE = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
|
||||
m512_const1_64( CB6 ) ); \
|
||||
VF = _mm512_xor_si512( _mm512_set1_epi64( T1 ), \
|
||||
m512_const1_64( CB7 ) ); \
|
||||
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
||||
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
||||
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
||||
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
|
||||
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
|
||||
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
|
||||
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
|
||||
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
|
||||
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
|
||||
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
|
||||
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
|
||||
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
|
||||
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
|
||||
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
|
||||
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
|
||||
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
|
||||
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
|
||||
ROUND_B_8WAY(0); \
|
||||
ROUND_B_8WAY(1); \
|
||||
ROUND_B_8WAY(2); \
|
||||
ROUND_B_8WAY(3); \
|
||||
ROUND_B_8WAY(4); \
|
||||
ROUND_B_8WAY(5); \
|
||||
ROUND_B_8WAY(6); \
|
||||
ROUND_B_8WAY(7); \
|
||||
ROUND_B_8WAY(8); \
|
||||
ROUND_B_8WAY(9); \
|
||||
ROUND_B_8WAY(0); \
|
||||
ROUND_B_8WAY(1); \
|
||||
ROUND_B_8WAY(2); \
|
||||
ROUND_B_8WAY(3); \
|
||||
ROUND_B_8WAY(4); \
|
||||
ROUND_B_8WAY(5); \
|
||||
H0 = mm512_xor4( V8, V0, S0, H0 ); \
|
||||
H1 = mm512_xor4( V9, V1, S1, H1 ); \
|
||||
H2 = mm512_xor4( VA, V2, S2, H2 ); \
|
||||
H3 = mm512_xor4( VB, V3, S3, H3 ); \
|
||||
H4 = mm512_xor4( VC, V4, S0, H4 ); \
|
||||
H5 = mm512_xor4( VD, V5, S1, H5 ); \
|
||||
H6 = mm512_xor4( VE, V6, S2, H6 ); \
|
||||
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
blake64_8way_init( blake_8way_big_context *sc, const sph_u64 *iv,
|
||||
const sph_u64 *salt )
|
||||
{
|
||||
__m512i zero = m512_zero;
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m512i( sc->S, 0 ) = zero;
|
||||
casti_m512i( sc->S, 1 ) = zero;
|
||||
casti_m512i( sc->S, 2 ) = zero;
|
||||
casti_m512i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
size_t ptr;
|
||||
DECL_STATE64_8WAY
|
||||
|
||||
const int buf_size = 128; // sizeof/8
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_8WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_8way_close( blake_8way_big_context *sc,
|
||||
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
|
||||
{
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
uint64_t z, zz;
|
||||
sph_u64 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>3] = _mm512_set1_epi64( zz );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
}
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[(104>>3)] = _mm512_or_si512( buf[(104>>3)],
|
||||
m512_const1_64( 0x0100000000000000ULL ) );
|
||||
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
|
||||
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_512( buf, 112>>3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
|
||||
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
|
||||
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf, 128 );
|
||||
}
|
||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_init(void *cc)
|
||||
{
|
||||
blake64_8way_init(cc, IV512, salt_zero_big);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake64_8way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
blake512_8way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_8way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
@@ -318,29 +611,6 @@ static const sph_u64 CB[16] = {
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
|
||||
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
|
||||
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
|
||||
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
|
||||
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
|
||||
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
|
||||
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
|
||||
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
|
||||
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
|
||||
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
|
||||
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
|
||||
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
|
||||
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
|
||||
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
|
||||
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
//current_impl
|
||||
#define ROUND_B_4WAY(r) do { \
|
||||
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
@@ -352,120 +622,11 @@ static const sph_u64 CB[16] = {
|
||||
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// Blake-512 4 way
|
||||
|
||||
#define DECL_STATE64_4WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define READ_STATE64_4WAY(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
S0 = (state)->S[0]; \
|
||||
S1 = (state)->S[1]; \
|
||||
S2 = (state)->S[2]; \
|
||||
S3 = (state)->S[3]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE64_4WAY(state) do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->S[0] = S0; \
|
||||
(state)->S[1] = S1; \
|
||||
(state)->S[2] = S2; \
|
||||
(state)->S[3] = S3; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
|
||||
// not used
|
||||
#define COMPRESS64_4WAY do { \
|
||||
__m256i M[16]; \
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, \
|
||||
0x0001020304050607 ) \
|
||||
unsigned r; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
|
||||
V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
|
||||
VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
|
||||
VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
||||
_mm256_set1_epi64x( CB4 ) ); \
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
|
||||
_mm256_set1_epi64x( CB5 ) ); \
|
||||
VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
|
||||
_mm256_set1_epi64x( CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
|
||||
_mm256_set1_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M[0x0] = _mm256_shuffle_epi8( *(buf+ 0), shuff_bswap64 ); \
|
||||
M[0x1] = _mm256_shuffle_epi8( *(buf+ 1), shuff_bswap64 ); \
|
||||
M[0x2] = _mm256_shuffle_epi8( *(buf+ 2), shuff_bswap64 ); \
|
||||
M[0x3] = _mm256_shuffle_epi8( *(buf+ 3), shuff_bswap64 ); \
|
||||
M[0x4] = _mm256_shuffle_epi8( *(buf+ 4), shuff_bswap64 ); \
|
||||
M[0x5] = _mm256_shuffle_epi8( *(buf+ 5), shuff_bswap64 ); \
|
||||
M[0x6] = _mm256_shuffle_epi8( *(buf+ 6), shuff_bswap64 ); \
|
||||
M[0x7] = _mm256_shuffle_epi8( *(buf+ 7), shuff_bswap64 ); \
|
||||
M[0x8] = _mm256_shuffle_epi8( *(buf+ 8), shuff_bswap64 ); \
|
||||
M[0x9] = _mm256_shuffle_epi8( *(buf+ 9), shuff_bswap64 ); \
|
||||
M[0xA] = _mm256_shuffle_epi8( *(buf+10), shuff_bswap64 ); \
|
||||
M[0xB] = _mm256_shuffle_epi8( *(buf+11), shuff_bswap64 ); \
|
||||
M[0xC] = _mm256_shuffle_epi8( *(buf+12), shuff_bswap64 ); \
|
||||
M[0xD] = _mm256_shuffle_epi8( *(buf+13), shuff_bswap64 ); \
|
||||
M[0xE] = _mm256_shuffle_epi8( *(buf+14), shuff_bswap64 ); \
|
||||
M[0xF] = _mm256_shuffle_epi8( *(buf+15), shuff_bswap64 ); \
|
||||
for (r = 0; r < 16; r ++) \
|
||||
ROUND_B_4WAY(r); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
|
||||
H1 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
|
||||
H2 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
|
||||
H3 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
|
||||
H4 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
|
||||
H5 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
|
||||
H6 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
|
||||
H7 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
//current impl
|
||||
|
||||
#define COMPRESS64_4WAY do \
|
||||
{ \
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
@@ -493,7 +654,8 @@ static const sph_u64 CB[16] = {
|
||||
m256_const1_64( CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
|
||||
m256_const1_64( CB7 ) ); \
|
||||
shuf_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
|
||||
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
|
||||
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
|
||||
@@ -536,9 +698,7 @@ static const sph_u64 CB[16] = {
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
//static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
||||
@@ -583,7 +743,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64_4WAY(sc);
|
||||
READ_STATE64(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
@@ -603,7 +763,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64_4WAY(sc);
|
||||
WRITE_STATE64(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
@@ -674,7 +834,7 @@ blake512_4way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
blake512_4way(void *cc, const void *data, size_t len)
|
||||
blake512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake64_4way(cc, data, len);
|
||||
}
|
||||
|
@@ -107,7 +107,8 @@ typedef struct {
|
||||
typedef bmw_2way_big_context bmw512_2way_context;
|
||||
|
||||
void bmw512_2way_init( bmw512_2way_context *ctx );
|
||||
void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
|
||||
void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
|
||||
|
||||
#endif // __SSE2__
|
||||
@@ -128,7 +129,8 @@ typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
||||
void bmw512_4way_init(void *cc);
|
||||
|
||||
void bmw512_4way(void *cc, const void *data, size_t len);
|
||||
void bmw512_4way_update(void *cc, const void *data, size_t len);
|
||||
#define bmw512_4way bmw512_4way_update
|
||||
|
||||
void bmw512_4way_close(void *cc, void *dst);
|
||||
|
||||
|
@@ -561,13 +561,10 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
|
||||
#endif // __SSE2__
|
||||
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// BMW-512 4 way 64
|
||||
|
||||
|
||||
#define sb0(x) \
|
||||
mm256_xor4( _mm256_srli_epi64( (x), 1), _mm256_slli_epi64( (x), 3), \
|
||||
mm256_rol_64( (x), 4), mm256_rol_64( (x),37) )
|
||||
@@ -1047,7 +1044,7 @@ bmw512_4way_init(void *cc)
|
||||
}
|
||||
|
||||
void
|
||||
bmw512_4way(void *cc, const void *data, size_t len)
|
||||
bmw512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
bmw64_4way(cc, data, len);
|
||||
}
|
||||
@@ -1137,8 +1134,6 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
s8b4( qt[ (i)- 2 ] ), s8b5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_b8( M, H, (i)-16 ) )
|
||||
|
||||
|
||||
|
||||
#define W8b0 \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_add_epi64( \
|
||||
@@ -1328,21 +1323,28 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
#define DH1( m, sl, sr, a, b, c ) \
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_slli_epi64( xh, sl ), \
|
||||
_mm512_srli_epi64( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DHL( m, rl, sl, h, a, b, c ) \
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi64( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_srli_epi64( xh, sl ), \
|
||||
_mm512_slli_epi64( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm512_add_epi64( _mm512_add_epi64( \
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_slli_epi64( xl, sl ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
|
||||
#define DHR( m, rl, sr, h, a, b, c ) \
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm512_add_epi64( _mm512_add_epi64( \
|
||||
mm512_rol_64( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
@@ -1350,26 +1352,27 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
|
||||
|
||||
dH[ 0] = DH1( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1( 1, 7, 8, 17, 25, 1 );
|
||||
dH[ 2] = DH1( 2, 5, 5, 18, 26, 2 );
|
||||
dH[ 3] = DH1( 3, 1, 5, 19, 27, 3 );
|
||||
dH[ 4] = DH1( 4, 3, 0, 20, 28, 4 );
|
||||
dH[ 5] = DH1( 5, 6, 6, 21, 29, 5 );
|
||||
dH[ 6] = DH1( 6, 4, 6, 22, 30, 6 );
|
||||
dH[ 7] = DH1( 7, 11, 2, 23, 31, 7 );
|
||||
dH[ 8] = DHL( 8, 9, 8, 4, 24, 23, 8 );
|
||||
dH[ 9] = DHR( 9, 10, 6, 5, 25, 16, 9 );
|
||||
dH[10] = DHL( 10, 11, 6, 6, 26, 17, 10 );
|
||||
dH[11] = DHL( 11, 12, 4, 7, 27, 18, 11 );
|
||||
dH[12] = DHR( 12, 13, 3, 0, 28, 19, 12 );
|
||||
dH[13] = DHR( 13, 14, 4, 1, 29, 20, 13 );
|
||||
dH[14] = DHR( 14, 15, 7, 2, 30, 21, 14 );
|
||||
dH[15] = DHR( 15, 16, 2, 3, 31, 22, 15 );
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
|
||||
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
|
||||
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
|
||||
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
|
||||
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
|
||||
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
|
||||
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
|
||||
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
|
||||
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
|
||||
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
|
||||
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
|
||||
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
|
||||
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
|
||||
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
|
||||
|
||||
#undef DH1
|
||||
#undef DHL
|
||||
#undef DHR
|
||||
#undef DH1L
|
||||
#undef DH1R
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
|
||||
}
|
||||
|
||||
|
@@ -26,6 +26,180 @@ static const uint64_t IV512[] =
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
static void transform_4way( cube_4way_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
||||
|
||||
x0 = _mm512_load_si512( (__m512i*)sp->h );
|
||||
x1 = _mm512_load_si512( (__m512i*)sp->h + 1 );
|
||||
x2 = _mm512_load_si512( (__m512i*)sp->h + 2 );
|
||||
x3 = _mm512_load_si512( (__m512i*)sp->h + 3 );
|
||||
x4 = _mm512_load_si512( (__m512i*)sp->h + 4 );
|
||||
x5 = _mm512_load_si512( (__m512i*)sp->h + 5 );
|
||||
x6 = _mm512_load_si512( (__m512i*)sp->h + 6 );
|
||||
x7 = _mm512_load_si512( (__m512i*)sp->h + 7 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm512_rol_32( x2, 7 );
|
||||
x1 = mm512_rol_32( x3, 7 );
|
||||
x2 = mm512_rol_32( y0, 7 );
|
||||
x3 = mm512_rol_32( y1, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap64_128( x4 );
|
||||
x5 = mm512_swap64_128( x5 );
|
||||
x6 = mm512_swap64_128( x6 );
|
||||
x7 = mm512_swap64_128( x7 );
|
||||
x4 = _mm512_add_epi32( x0, x4 );
|
||||
x5 = _mm512_add_epi32( x1, x5 );
|
||||
x6 = _mm512_add_epi32( x2, x6 );
|
||||
x7 = _mm512_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm512_rol_32( x1, 11 );
|
||||
x1 = mm512_rol_32( y0, 11 );
|
||||
x2 = mm512_rol_32( x3, 11 );
|
||||
x3 = mm512_rol_32( y1, 11 );
|
||||
x0 = _mm512_xor_si512( x0, x4 );
|
||||
x1 = _mm512_xor_si512( x1, x5 );
|
||||
x2 = _mm512_xor_si512( x2, x6 );
|
||||
x3 = _mm512_xor_si512( x3, x7 );
|
||||
x4 = mm512_swap32_64( x4 );
|
||||
x5 = mm512_swap32_64( x5 );
|
||||
x6 = mm512_swap32_64( x6 );
|
||||
x7 = mm512_swap32_64( x7 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)sp->h, x0 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 1, x1 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 2, x2 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 3, x3 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 4, x4 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 5, x5 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 6, x6 );
|
||||
_mm512_store_si512( (__m512i*)sp->h + 7, x7 );
|
||||
}
|
||||
|
||||
int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m512i *h = (__m512i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output )
|
||||
{
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
@@ -91,7 +265,6 @@ static void transform_2way( cube_2way_context *sp )
|
||||
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
|
||||
|
||||
}
|
||||
|
||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
@@ -132,9 +305,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||
const __m256i *in = (__m256i*)data;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
|
203
algo/cubehash/cube-hash-2way.c.save
Normal file
203
algo/cubehash/cube-hash-2way.c.save
Normal file
@@ -0,0 +1,203 @@
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
#include "cube-hash-2way.h"
|
||||
|
||||
// 2x128
|
||||
|
||||
|
||||
// The result of hashing 10 rounds of initial data which consists of params
|
||||
// zero padded.
|
||||
static const uint64_t IV256[] =
|
||||
{
|
||||
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
|
||||
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
|
||||
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
|
||||
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
|
||||
};
|
||||
|
||||
static const uint64_t IV512[] =
|
||||
{
|
||||
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
|
||||
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
||||
|
||||
x0 = _mm256_load_si256( (__m256i*)sp->h );
|
||||
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
|
||||
x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
|
||||
x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
|
||||
x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
|
||||
x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
|
||||
x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
|
||||
x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm256_rol_32( x2, 7 );
|
||||
x1 = mm256_rol_32( x3, 7 );
|
||||
x2 = mm256_rol_32( y0, 7 );
|
||||
x3 = mm256_rol_32( y1, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap64_128( x4 );
|
||||
x5 = mm256_swap64_128( x5 );
|
||||
x6 = mm256_swap64_128( x6 );
|
||||
x7 = mm256_swap64_128( x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm256_rol_32( x1, 11 );
|
||||
x1 = mm256_rol_32( y0, 11 );
|
||||
x2 = mm256_rol_32( x3, 11 );
|
||||
x3 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap32_64( x4 );
|
||||
x5 = mm256_swap32_64( x5 );
|
||||
x6 = mm256_swap32_64( x6 );
|
||||
x7 = mm256_swap32_64( x7 );
|
||||
}
|
||||
|
||||
_mm256_store_si256( (__m256i*)sp->h, x0 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 1, x1 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 2, x2 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 3, x3 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 4, x4 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
|
||||
|
||||
}
|
||||
|
||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m256i *h = (__m256i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_2way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_close( cube_2way_context *sp, void *output )
|
||||
{
|
||||
__m256i *hash = (__m256i*)output;
|
||||
int i;
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
__m256i *hash = (__m256i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_2way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,11 +1,38 @@
|
||||
#ifndef CUBE_HASH_2WAY_H__
|
||||
#define CUBE_HASH_2WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define CUBE_HASH_2WAY_H__ 1
|
||||
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
struct _cube_4way_context
|
||||
{
|
||||
__m512i h[8];
|
||||
int hashlen;
|
||||
int rounds;
|
||||
int blocksize;
|
||||
int pos;
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
typedef struct _cube_4way_context cube_4way_context;
|
||||
|
||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_4way_reinit( cube_4way_context *sp );
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
#endif
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
|
||||
struct _cube_2way_context
|
||||
@@ -15,7 +42,7 @@ struct _cube_2way_context
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
} __attribute__ ((aligned (64)));
|
||||
} __attribute__ ((aligned (128)));
|
||||
|
||||
typedef struct _cube_2way_context cube_2way_context;
|
||||
|
||||
|
36
algo/cubehash/cube-hash-2way.h.save
Normal file
36
algo/cubehash/cube-hash-2way.h.save
Normal file
@@ -0,0 +1,36 @@
|
||||
#ifndef CUBE_HASH_2WAY_H__
|
||||
#define CUBE_HASH_2WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
|
||||
struct _cube_2way_context
|
||||
{
|
||||
__m256i h[8];
|
||||
int hashlen; // __m128i
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef struct _cube_2way_context cube_2way_context;
|
||||
|
||||
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_2way_reinit( cube_2way_context *sp );
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_2way_close( cube_2way_context *sp, void *output );
|
||||
|
||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
|
||||
#endif
|
||||
#endif
|
@@ -92,6 +92,38 @@ extern "C"{
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define Sb_8W(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m512i cc = _mm512_set1_epi64( c ); \
|
||||
x3 = mm512_not( x3 ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_andnot_si512( x2, cc ) ); \
|
||||
tmp = _mm512_xor_si512( cc, _mm512_and_si512( x0, x1 ) ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_and_si512( x2, x3 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, _mm512_andnot_si512( x1, x2 ) ); \
|
||||
x1 = _mm512_xor_si512( x1, _mm512_and_si512( x0, x2 ) ); \
|
||||
x2 = _mm512_xor_si512( x2, _mm512_andnot_si512( x3, x0 ) ); \
|
||||
x0 = _mm512_xor_si512( x0, _mm512_or_si512( x1, x3 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, _mm512_and_si512( x1, x2 ) ); \
|
||||
x1 = _mm512_xor_si512( x1, _mm512_and_si512( tmp, x0 ) ); \
|
||||
x2 = _mm512_xor_si512( x2, tmp ); \
|
||||
} while (0)
|
||||
|
||||
#define Lb_8W(x0, x1, x2, x3, x4, x5, x6, x7) \
|
||||
do { \
|
||||
x4 = _mm512_xor_si512( x4, x1 ); \
|
||||
x5 = _mm512_xor_si512( x5, x2 ); \
|
||||
x6 = _mm512_xor_si512( x6, _mm512_xor_si512( x3, x0 ) ); \
|
||||
x7 = _mm512_xor_si512( x7, x0 ); \
|
||||
x0 = _mm512_xor_si512( x0, x5 ); \
|
||||
x1 = _mm512_xor_si512( x1, x6 ); \
|
||||
x2 = _mm512_xor_si512( x2, _mm512_xor_si512( x7, x4 ) ); \
|
||||
x3 = _mm512_xor_si512( x3, x4 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define Sb(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
__m256i cc = _mm256_set1_epi64x( c ); \
|
||||
@@ -226,6 +258,48 @@ static const sph_u64 C[] = {
|
||||
x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define S_8W(x0, x1, x2, x3, cb, r) do { \
|
||||
Sb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
|
||||
Sb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
|
||||
} while (0)
|
||||
|
||||
#define L_8W(x0, x1, x2, x3, x4, x5, x6, x7) do { \
|
||||
Lb_8W(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
|
||||
x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
|
||||
Lb_8W(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
|
||||
x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
|
||||
} while (0)
|
||||
|
||||
#define Wz_8W(x, c, n) \
|
||||
do { \
|
||||
__m512i t = _mm512_slli_epi64( _mm512_and_si512(x ## h, (c)), (n) ); \
|
||||
x ## h = _mm512_or_si512( _mm512_and_si512( \
|
||||
_mm512_srli_epi64(x ## h, (n)), (c)), t ); \
|
||||
t = _mm512_slli_epi64( _mm512_and_si512(x ## l, (c)), (n) ); \
|
||||
x ## l = _mm512_or_si512( _mm512_and_si512((x ## l >> (n)), (c)), t ); \
|
||||
} while (0)
|
||||
|
||||
#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
|
||||
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
|
||||
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
#define W83(x) Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ), 8 )
|
||||
#define W84(x) Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )
|
||||
#define W85(x) Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )
|
||||
#define W86(x) \
|
||||
do { \
|
||||
__m512i t = x ## h; \
|
||||
x ## h = x ## l; \
|
||||
x ## l = t; \
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE_8W \
|
||||
__m512i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
|
||||
__m512i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
|
||||
__m512i tmp;
|
||||
|
||||
#endif
|
||||
|
||||
#define Wz(x, c, n) \
|
||||
do { \
|
||||
@@ -236,16 +310,6 @@ do { \
|
||||
x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
/*
|
||||
#define Wz(x, c, n) do { \
|
||||
sph_u64 t = (x ## h & (c)) << (n); \
|
||||
x ## h = ((x ## h >> (n)) & (c)) | t; \
|
||||
t = (x ## l & (c)) << (n); \
|
||||
x ## l = ((x ## l >> (n)) & (c)) | t; \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#define W0(x) Wz(x, m256_const1_64( 0x5555555555555555 ), 1 )
|
||||
#define W1(x) Wz(x, m256_const1_64( 0x3333333333333333 ), 2 )
|
||||
#define W2(x) Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
@@ -259,25 +323,12 @@ do { \
|
||||
x ## l = t; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1)
|
||||
#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2)
|
||||
#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4)
|
||||
#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8)
|
||||
#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
|
||||
#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
|
||||
#define W6(x) do { \
|
||||
sph_u64 t = x ## h; \
|
||||
x ## h = x ## l; \
|
||||
x ## l = t; \
|
||||
} while (0)
|
||||
*/
|
||||
|
||||
#define DECL_STATE \
|
||||
__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
|
||||
__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
|
||||
__m256i tmp;
|
||||
|
||||
|
||||
#define READ_STATE(state) do { \
|
||||
h0h = (state)->H[ 0]; \
|
||||
h0l = (state)->H[ 1]; \
|
||||
@@ -316,6 +367,38 @@ do { \
|
||||
(state)->H[15] = h7l; \
|
||||
} while (0)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define INPUT_BUF1_8W \
|
||||
__m512i m0h = buf[0]; \
|
||||
__m512i m0l = buf[1]; \
|
||||
__m512i m1h = buf[2]; \
|
||||
__m512i m1l = buf[3]; \
|
||||
__m512i m2h = buf[4]; \
|
||||
__m512i m2l = buf[5]; \
|
||||
__m512i m3h = buf[6]; \
|
||||
__m512i m3l = buf[7]; \
|
||||
h0h = _mm512_xor_si512( h0h, m0h ); \
|
||||
h0l = _mm512_xor_si512( h0l, m0l ); \
|
||||
h1h = _mm512_xor_si512( h1h, m1h ); \
|
||||
h1l = _mm512_xor_si512( h1l, m1l ); \
|
||||
h2h = _mm512_xor_si512( h2h, m2h ); \
|
||||
h2l = _mm512_xor_si512( h2l, m2l ); \
|
||||
h3h = _mm512_xor_si512( h3h, m3h ); \
|
||||
h3l = _mm512_xor_si512( h3l, m3l ); \
|
||||
|
||||
#define INPUT_BUF2_8W \
|
||||
h4h = _mm512_xor_si512( h4h, m0h ); \
|
||||
h4l = _mm512_xor_si512( h4l, m0l ); \
|
||||
h5h = _mm512_xor_si512( h5h, m1h ); \
|
||||
h5l = _mm512_xor_si512( h5l, m1l ); \
|
||||
h6h = _mm512_xor_si512( h6h, m2h ); \
|
||||
h6l = _mm512_xor_si512( h6l, m2l ); \
|
||||
h7h = _mm512_xor_si512( h7h, m3h ); \
|
||||
h7l = _mm512_xor_si512( h7l, m3l ); \
|
||||
|
||||
#endif
|
||||
|
||||
#define INPUT_BUF1 \
|
||||
__m256i m0h = buf[0]; \
|
||||
__m256i m0l = buf[1]; \
|
||||
@@ -344,6 +427,7 @@ do { \
|
||||
h7h = _mm256_xor_si256( h7h, m3h ); \
|
||||
h7l = _mm256_xor_si256( h7l, m3l ); \
|
||||
|
||||
|
||||
static const sph_u64 IV256[] = {
|
||||
C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
|
||||
C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
|
||||
@@ -370,6 +454,22 @@ static const sph_u64 IV512[] = {
|
||||
#else
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define SL_8W(ro) SLu_8W(r + ro, ro)
|
||||
|
||||
#define SLu_8W(r, ro) do { \
|
||||
S_8W(h0, h2, h4, h6, Ceven_, r); \
|
||||
S_8W(h1, h3, h5, h7, Codd_, r); \
|
||||
L_8W(h0, h2, h4, h6, h1, h3, h5, h7); \
|
||||
W8 ## ro(h1); \
|
||||
W8 ## ro(h3); \
|
||||
W8 ## ro(h5); \
|
||||
W8 ## ro(h7); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define SL(ro) SLu(r + ro, ro)
|
||||
@@ -393,6 +493,23 @@ static const sph_u64 IV512[] = {
|
||||
* loop.
|
||||
*/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define E8_8W do { \
|
||||
unsigned r; \
|
||||
for (r = 0; r < 42; r += 7) { \
|
||||
SL_8W(0); \
|
||||
SL_8W(1); \
|
||||
SL_8W(2); \
|
||||
SL_8W(3); \
|
||||
SL_8W(4); \
|
||||
SL_8W(5); \
|
||||
SL_8W(6); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define E8 do { \
|
||||
unsigned r; \
|
||||
for (r = 0; r < 42; r += 7) { \
|
||||
@@ -419,51 +536,100 @@ static const sph_u64 IV512[] = {
|
||||
* On a "true 64-bit" architecture, we can unroll at will.
|
||||
*/
|
||||
|
||||
#define E8 do { \
|
||||
SLu( 0, 0); \
|
||||
SLu( 1, 1); \
|
||||
SLu( 2, 2); \
|
||||
SLu( 3, 3); \
|
||||
SLu( 4, 4); \
|
||||
SLu( 5, 5); \
|
||||
SLu( 6, 6); \
|
||||
SLu( 7, 0); \
|
||||
SLu( 8, 1); \
|
||||
SLu( 9, 2); \
|
||||
SLu(10, 3); \
|
||||
SLu(11, 4); \
|
||||
SLu(12, 5); \
|
||||
SLu(13, 6); \
|
||||
SLu(14, 0); \
|
||||
SLu(15, 1); \
|
||||
SLu(16, 2); \
|
||||
SLu(17, 3); \
|
||||
SLu(18, 4); \
|
||||
SLu(19, 5); \
|
||||
SLu(20, 6); \
|
||||
SLu(21, 0); \
|
||||
SLu(22, 1); \
|
||||
SLu(23, 2); \
|
||||
SLu(24, 3); \
|
||||
SLu(25, 4); \
|
||||
SLu(26, 5); \
|
||||
SLu(27, 6); \
|
||||
SLu(28, 0); \
|
||||
SLu(29, 1); \
|
||||
SLu(30, 2); \
|
||||
SLu(31, 3); \
|
||||
SLu(32, 4); \
|
||||
SLu(33, 5); \
|
||||
SLu(34, 6); \
|
||||
SLu(35, 0); \
|
||||
SLu(36, 1); \
|
||||
SLu(37, 2); \
|
||||
SLu(38, 3); \
|
||||
SLu(39, 4); \
|
||||
SLu(40, 5); \
|
||||
SLu(41, 6); \
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define E8_8W do { \
|
||||
SLu_8W( 0, 0); \
|
||||
SLu_8W( 1, 1); \
|
||||
SLu_8W( 2, 2); \
|
||||
SLu_8W( 3, 3); \
|
||||
SLu_8W( 4, 4); \
|
||||
SLu_8W( 5, 5); \
|
||||
SLu_8W( 6, 6); \
|
||||
SLu_8W( 7, 0); \
|
||||
SLu_8W( 8, 1); \
|
||||
SLu_8W( 9, 2); \
|
||||
SLu_8W(10, 3); \
|
||||
SLu_8W(11, 4); \
|
||||
SLu_8W(12, 5); \
|
||||
SLu_8W(13, 6); \
|
||||
SLu_8W(14, 0); \
|
||||
SLu_8W(15, 1); \
|
||||
SLu_8W(16, 2); \
|
||||
SLu_8W(17, 3); \
|
||||
SLu_8W(18, 4); \
|
||||
SLu_8W(19, 5); \
|
||||
SLu_8W(20, 6); \
|
||||
SLu_8W(21, 0); \
|
||||
SLu_8W(22, 1); \
|
||||
SLu_8W(23, 2); \
|
||||
SLu_8W(24, 3); \
|
||||
SLu_8W(25, 4); \
|
||||
SLu_8W(26, 5); \
|
||||
SLu_8W(27, 6); \
|
||||
SLu_8W(28, 0); \
|
||||
SLu_8W(29, 1); \
|
||||
SLu_8W(30, 2); \
|
||||
SLu_8W(31, 3); \
|
||||
SLu_8W(32, 4); \
|
||||
SLu_8W(33, 5); \
|
||||
SLu_8W(34, 6); \
|
||||
SLu_8W(35, 0); \
|
||||
SLu_8W(36, 1); \
|
||||
SLu_8W(37, 2); \
|
||||
SLu_8W(38, 3); \
|
||||
SLu_8W(39, 4); \
|
||||
SLu_8W(40, 5); \
|
||||
SLu_8W(41, 6); \
|
||||
} while (0)
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#define E8 do { \
|
||||
SLu( 0, 0); \
|
||||
SLu( 1, 1); \
|
||||
SLu( 2, 2); \
|
||||
SLu( 3, 3); \
|
||||
SLu( 4, 4); \
|
||||
SLu( 5, 5); \
|
||||
SLu( 6, 6); \
|
||||
SLu( 7, 0); \
|
||||
SLu( 8, 1); \
|
||||
SLu( 9, 2); \
|
||||
SLu(10, 3); \
|
||||
SLu(11, 4); \
|
||||
SLu(12, 5); \
|
||||
SLu(13, 6); \
|
||||
SLu(14, 0); \
|
||||
SLu(15, 1); \
|
||||
SLu(16, 2); \
|
||||
SLu(17, 3); \
|
||||
SLu(18, 4); \
|
||||
SLu(19, 5); \
|
||||
SLu(20, 6); \
|
||||
SLu(21, 0); \
|
||||
SLu(22, 1); \
|
||||
SLu(23, 2); \
|
||||
SLu(24, 3); \
|
||||
SLu(25, 4); \
|
||||
SLu(26, 5); \
|
||||
SLu(27, 6); \
|
||||
SLu(28, 0); \
|
||||
SLu(29, 1); \
|
||||
SLu(30, 2); \
|
||||
SLu(31, 3); \
|
||||
SLu(32, 4); \
|
||||
SLu(33, 5); \
|
||||
SLu(34, 6); \
|
||||
SLu(35, 0); \
|
||||
SLu(36, 1); \
|
||||
SLu(37, 2); \
|
||||
SLu(38, 3); \
|
||||
SLu(39, 4); \
|
||||
SLu(40, 5); \
|
||||
SLu(41, 6); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -471,6 +637,158 @@ static const sph_u64 IV512[] = {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
void jh256_8way_init( jh_8way_context *sc )
|
||||
{
|
||||
// bswapped IV256
|
||||
sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
|
||||
sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
|
||||
sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
|
||||
sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
|
||||
sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
|
||||
sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
|
||||
sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
|
||||
sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
|
||||
sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
|
||||
sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
|
||||
sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
|
||||
sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
|
||||
sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
|
||||
sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
|
||||
sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
|
||||
sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
|
||||
sc->ptr = 0;
|
||||
sc->block_count = 0;
|
||||
}
|
||||
|
||||
// Initialize an 8-lane (AVX-512) JH-512 context.
// Same layout as jh256_8way_init but loaded with the byte-swapped JH-512 IV,
// broadcast to all 8 lanes.
void jh512_8way_init( jh_8way_context *sc )
{
   // bswapped IV512
   sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
   sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
   sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
   sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
   sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
   sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
   sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
   sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
   sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
   sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
   sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
   sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
   sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
   sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
   sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
   sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
   sc->ptr = 0;            // byte offset into the partial-block buffer
   sc->block_count = 0;    // number of full 512-bit blocks processed
}
|
||||
|
||||
// Absorb 'len' bytes (per lane) of interleaved message data into the 8-way
// JH state.  'data' is 8-way interleaved: each __m512i element carries one
// 64-bit word from each of the 8 lanes, so byte offsets within a lane map to
// vector indices via (offset >> 3).
// Partial blocks are buffered in sc->buf; a full 64-byte (per lane) block
// triggers one application of the E8 permutation.
static void
jh_8way_core( jh_8way_context *sc, const void *data, size_t len )
{
   __m512i *buf;
   __m512i *vdata = (__m512i*)data;
   const int buf_size = 64;   // block size in bytes per lane (8 __m512i)
   size_t ptr;
   DECL_STATE_8W

   buf = sc->buf;
   ptr = sc->ptr;

   // Not enough data to complete a block: stash it and return.
   if ( len < (buf_size - ptr) )
   {
      memcpy_512( buf + (ptr>>3), vdata, len>>3 );  // >>3: bytes -> vectors
      ptr += len;
      sc->ptr = ptr;
      return;
   }

   READ_STATE(sc);
   while ( len > 0 )
   {
      size_t clen;              // bytes to copy this iteration
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;

      memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
      ptr += clen;
      vdata += (clen>>3);
      len -= clen;
      // Block complete: run the JH compression (xor in, permute, xor out).
      if ( ptr == buf_size )
      {
         INPUT_BUF1_8W;
         E8_8W;
         INPUT_BUF2_8W;
         sc->block_count ++;
         ptr = 0;
      }
   }
   WRITE_STATE(sc);
   sc->ptr = ptr;
}
|
||||
|
||||
// Pad and finalize the 8-way JH state, writing the (interleaved) digest to
// 'dst'.  JH padding: a 0x80 byte, zeros, then the 128-bit big-endian bit
// count, with the total rounded to one or two full blocks.
// NOTE(review): 'ub', 'n' and 'out_size_w32' are unused here — the full
// H[8..15] half-state (512 bits per lane) is always copied out and the
// caller is presumably expected to take the low/high part it needs; confirm
// against the callers before relying on dst being smaller than 64 bytes/lane.
static void
jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
               size_t out_size_w32, const void *iv )
{
   // NOTE(review): 16*4 = 64 vectors; at most 16 are ever used
   // (numz max 112 bytes + 16 byte length field = 128 bytes = 16 vectors).
   __m512i buf[16*4];
   __m512i *dst512 = (__m512i*)dst;
   size_t numz, u;
   sph_u64 l0, l1, l0e, l1e;

   // First 8 bytes per lane: 0x80 pad marker followed by zeros.
   buf[0] = m512_const1_64( 0x80ULL );

   // Zero-fill so that (pad + 16-byte length) ends exactly on a block
   // boundary: one block when the buffer is empty, two otherwise.
   if ( sc->ptr == 0 )
      numz = 48;
   else
      numz = 112 - sc->ptr;

   memset_zero_512( buf+1, (numz>>3) - 1 );

   // 128-bit message bit count: 512 bits per full block + buffered bits.
   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
   l1 = SPH_T64(sc->block_count >> 55);
   sph_enc64be( &l0e, l0 );   // big-endian encode per JH spec
   sph_enc64be( &l1e, l1 );
   *(buf + (numz>>3) ) = _mm512_set1_epi64( l1e );     // high half first
   *(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );

   jh_8way_core( sc, buf, numz + 16 );

   // Digest is the second half of the state, H[8..15].
   for ( u=0; u < 8; u++ )
      buf[u] = sc->H[u+8];

   memcpy_512( dst512, buf, 8 );
}
|
||||
|
||||
void
|
||||
jh256_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
jh_8way_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
jh256_8way_close(void *cc, void *dst)
|
||||
{
|
||||
jh_8way_close(cc, 0, 0, dst, 8, IV256);
|
||||
}
|
||||
|
||||
void
|
||||
jh512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
jh_8way_core(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
jh512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
jh_8way_close(cc, 0, 0, dst, 16, IV512);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void jh256_4way_init( jh_4way_context *sc )
|
||||
{
|
||||
// bswapped IV256
|
||||
@@ -595,16 +913,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
memcpy_256( dst256, buf, 8 );
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
jh256_4way_init(void *cc)
|
||||
{
|
||||
jhs_4way_init(cc, IV256);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
jh256_4way(void *cc, const void *data, size_t len)
|
||||
jh256_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
jh_4way_core(cc, data, len);
|
||||
}
|
||||
@@ -615,16 +925,8 @@ jh256_4way_close(void *cc, void *dst)
|
||||
jh_4way_close(cc, 0, 0, dst, 8, IV256);
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
jh512_4way_init(void *cc)
|
||||
{
|
||||
jhb_4way_init(cc, IV512);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
jh512_4way(void *cc, const void *data, size_t len)
|
||||
jh512_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
jh_4way_core(cc, data, len);
|
||||
}
|
||||
@@ -635,6 +937,7 @@ jh512_4way_close(void *cc, void *dst)
|
||||
jh_4way_close(cc, 0, 0, dst, 16, IV512);
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -60,20 +60,41 @@ extern "C"{
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[8] __attribute__ ((aligned (64)));
|
||||
__m512i buf[8];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
} jh_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_8way_context jh256_8way_context;
|
||||
|
||||
typedef jh_8way_context jh512_8way_context;
|
||||
|
||||
void jh256_8way_init( jh_8way_context *sc);
|
||||
|
||||
void jh256_8way_update(void *cc, const void *data, size_t len);
|
||||
|
||||
void jh256_8way_close(void *cc, void *dst);
|
||||
|
||||
void jh512_8way_init( jh_8way_context *sc );
|
||||
|
||||
void jh512_8way_update(void *cc, const void *data, size_t len);
|
||||
|
||||
void jh512_8way_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[8];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
/*
|
||||
unsigned char buf[64];
|
||||
size_t ptr;
|
||||
union {
|
||||
sph_u64 wide[16];
|
||||
} H;
|
||||
sph_u64 block_count;
|
||||
*/
|
||||
} jh_4way_context;
|
||||
} jh_4way_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_4way_context jh256_4way_context;
|
||||
|
||||
@@ -81,13 +102,15 @@ typedef jh_4way_context jh512_4way_context;
|
||||
|
||||
void jh256_4way_init( jh_4way_context *sc);
|
||||
|
||||
void jh256_4way(void *cc, const void *data, size_t len);
|
||||
void jh256_4way_update(void *cc, const void *data, size_t len);
|
||||
#define jh256_4way jh256_4way_update
|
||||
|
||||
void jh256_4way_close(void *cc, void *dst);
|
||||
|
||||
void jh512_4way_init( jh_4way_context *sc );
|
||||
|
||||
void jh512_4way(void *cc, const void *data, size_t len);
|
||||
void jh512_4way_update(void *cc, const void *data, size_t len);
|
||||
#define jh512_4way jh512_4way_update
|
||||
|
||||
void jh512_4way_close(void *cc, void *dst);
|
||||
|
||||
@@ -95,6 +118,6 @@ void jh512_4way_close(void *cc, void *dst);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif // AVX2
|
||||
|
||||
#endif
|
||||
|
@@ -6,11 +6,591 @@
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
/* initial values of chaining variables */
|
||||
static const uint32 IV[40] __attribute((aligned(64))) = {
|
||||
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
|
||||
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
|
||||
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
|
||||
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
|
||||
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
|
||||
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
|
||||
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
|
||||
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
|
||||
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
|
||||
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
|
||||
};
|
||||
|
||||
/* Round Constants */
|
||||
static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
|
||||
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
|
||||
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
|
||||
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
|
||||
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
|
||||
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
|
||||
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
|
||||
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
|
||||
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
|
||||
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
|
||||
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
|
||||
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
|
||||
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
|
||||
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
|
||||
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
|
||||
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
|
||||
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
|
||||
0x00000000,0x00000000,0x00000000,0x5090d577,
|
||||
0x00000000,0x00000000,0x00000000,0xac11d7fa,
|
||||
0x00000000,0x00000000,0x00000000,0x2d1925ab,
|
||||
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
|
||||
0x00000000,0x00000000,0x00000000,0xb46496ac,
|
||||
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
|
||||
0x00000000,0x00000000,0x00000000,0xd1925ab0,
|
||||
0x00000000,0x00000000,0x00000000,0x78602649,
|
||||
0x00000000,0x00000000,0x00000000,0x29131ab6,
|
||||
0x00000000,0x00000000,0x00000000,0x8edae952,
|
||||
0x00000000,0x00000000,0x00000000,0x0fc053c3,
|
||||
0x00000000,0x00000000,0x00000000,0x3b6ba548,
|
||||
0x00000000,0x00000000,0x00000000,0x3f014f0c,
|
||||
0x00000000,0x00000000,0x00000000,0xedae9520,
|
||||
0x00000000,0x00000000,0x00000000,0xfc053c31
|
||||
};
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define cns4w(i) m512_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT4W(a,b,c0,c1)\
|
||||
a = _mm512_xor_si512(a,c0);\
|
||||
b = _mm512_xor_si512(b,c1);
|
||||
|
||||
// Luffa "tweak" multiplication by x in GF(2^8)^32, vectorized over 4 lanes
// per 128-bit sub-block.  a0:a1 form a 256-bit value per lane; the pair is
// rotated one 32-bit word with the wrapped word folded back in via 'mask'
// (mask selects the low 32-bit word of each 128-bit group).
#define MULT24W( a0, a1, mask ) \
do { \
  __m512i b = _mm512_xor_si512( a0, \
                 _mm512_shuffle_epi32( _mm512_and_si512(a1,mask), 16 ) ); \
  a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
  a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
} while(0)
|
||||
|
||||
// confirm pointer arithmetic
|
||||
// ok but use array indexes
|
||||
#define STEP_PART4W(x,c0,c1,t)\
|
||||
SUBCRUMB4W(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB4W(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD4W(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD4W(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT4W(*x, *(x+4), c0, c1);
|
||||
|
||||
#define SUBCRUMB4W(a0,a1,a2,a3,t)\
|
||||
t = _mm512_load_si512(&a0);\
|
||||
a0 = _mm512_or_si512(a0,a1);\
|
||||
a2 = _mm512_xor_si512(a2,a3);\
|
||||
a1 = _mm512_andnot_si512(a1, m512_neg1 );\
|
||||
a0 = _mm512_xor_si512(a0,a3);\
|
||||
a3 = _mm512_and_si512(a3,t);\
|
||||
a1 = _mm512_xor_si512(a1,a3);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
a2 = _mm512_and_si512(a2,a0);\
|
||||
a0 = _mm512_andnot_si512(a0, m512_neg1 );\
|
||||
a2 = _mm512_xor_si512(a2,a1);\
|
||||
a1 = _mm512_or_si512(a1,a3);\
|
||||
t = _mm512_xor_si512(t,a1);\
|
||||
a3 = _mm512_xor_si512(a3,a2);\
|
||||
a2 = _mm512_and_si512(a2,a1);\
|
||||
a1 = _mm512_xor_si512(a1,a0);\
|
||||
a0 = _mm512_load_si512(&t);
|
||||
|
||||
#define MIXWORD4W(a,b,t1,t2)\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,2);\
|
||||
t2 = _mm512_srli_epi32(a,30);\
|
||||
a = _mm512_or_si512(t1,t2);\
|
||||
a = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(b,14);\
|
||||
t2 = _mm512_srli_epi32(b,18);\
|
||||
b = _mm512_or_si512(t1,t2);\
|
||||
b = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(a,10);\
|
||||
t2 = _mm512_srli_epi32(a,22);\
|
||||
a = _mm512_or_si512(t1,t2);\
|
||||
a = _mm512_xor_si512(a,b);\
|
||||
t1 = _mm512_slli_epi32(b,1);\
|
||||
t2 = _mm512_srli_epi32(b,31);\
|
||||
b = _mm512_or_si512(t1,t2);
|
||||
|
||||
#define STEP_PART24W(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm512_shuffle_epi32(a1,147);\
|
||||
t0 = _mm512_load_si512(&a1);\
|
||||
a1 = _mm512_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm512_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm512_shuffle_epi32(t0,78);\
|
||||
a0 = _mm512_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB4W(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm512_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm512_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm512_load_si512(&a1);\
|
||||
a0 = _mm512_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm512_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm512_shuffle_epi32(a1,57);\
|
||||
MIXWORD4W(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT4W(a0,a1,c0,c1);
|
||||
|
||||
#define NMLTOM7684W(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm512_load_si512(&r1);\
|
||||
q2 = _mm512_load_si512(&p1);\
|
||||
r2 = _mm512_shuffle_epi32(r2,216);\
|
||||
p2 = _mm512_shuffle_epi32(p2,216);\
|
||||
r1 = _mm512_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm512_unpacklo_epi32(p1,p0);\
|
||||
s2 = _mm512_unpackhi_epi32(s2,r0);\
|
||||
q2 = _mm512_unpackhi_epi32(q2,p0);\
|
||||
s0 = _mm512_load_si512(&r2);\
|
||||
q0 = _mm512_load_si512(&p2);\
|
||||
r2 = _mm512_unpacklo_epi64(r2,r1);\
|
||||
p2 = _mm512_unpacklo_epi64(p2,p1);\
|
||||
s1 = _mm512_load_si512(&s0);\
|
||||
q1 = _mm512_load_si512(&q0);\
|
||||
s0 = _mm512_unpackhi_epi64(s0,r1);\
|
||||
q0 = _mm512_unpackhi_epi64(q0,p1);\
|
||||
r2 = _mm512_shuffle_epi32(r2,225);\
|
||||
p2 = _mm512_shuffle_epi32(p2,225);\
|
||||
r0 = _mm512_load_si512(&s1);\
|
||||
p0 = _mm512_load_si512(&q1);\
|
||||
s0 = _mm512_shuffle_epi32(s0,225);\
|
||||
q0 = _mm512_shuffle_epi32(q0,225);\
|
||||
s1 = _mm512_unpacklo_epi64(s1,s2);\
|
||||
q1 = _mm512_unpacklo_epi64(q1,q2);\
|
||||
r0 = _mm512_unpackhi_epi64(r0,s2);\
|
||||
p0 = _mm512_unpackhi_epi64(p0,q2);\
|
||||
s2 = _mm512_load_si512(&r0);\
|
||||
q2 = _mm512_load_si512(&p0);\
|
||||
s3 = _mm512_load_si512(&r2);\
|
||||
q3 = _mm512_load_si512(&p2);
|
||||
|
||||
#define MIXTON7684W(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm512_load_si512(&r0);\
|
||||
q0 = _mm512_load_si512(&p0);\
|
||||
s1 = _mm512_load_si512(&r2);\
|
||||
q1 = _mm512_load_si512(&p2);\
|
||||
r0 = _mm512_unpackhi_epi32(r0,r1);\
|
||||
p0 = _mm512_unpackhi_epi32(p0,p1);\
|
||||
r2 = _mm512_unpackhi_epi32(r2,r3);\
|
||||
p2 = _mm512_unpackhi_epi32(p2,p3);\
|
||||
s0 = _mm512_unpacklo_epi32(s0,r1);\
|
||||
q0 = _mm512_unpacklo_epi32(q0,p1);\
|
||||
s1 = _mm512_unpacklo_epi32(s1,r3);\
|
||||
q1 = _mm512_unpacklo_epi32(q1,p3);\
|
||||
r1 = _mm512_load_si512(&r0);\
|
||||
p1 = _mm512_load_si512(&p0);\
|
||||
r0 = _mm512_unpackhi_epi64(r0,r2);\
|
||||
p0 = _mm512_unpackhi_epi64(p0,p2);\
|
||||
s0 = _mm512_unpackhi_epi64(s0,s1);\
|
||||
q0 = _mm512_unpackhi_epi64(q0,q1);\
|
||||
r1 = _mm512_unpacklo_epi64(r1,r2);\
|
||||
p1 = _mm512_unpacklo_epi64(p1,p2);\
|
||||
s2 = _mm512_load_si512(&r0);\
|
||||
q2 = _mm512_load_si512(&p0);\
|
||||
s1 = _mm512_load_si512(&r1);\
|
||||
q1 = _mm512_load_si512(&p1);
|
||||
|
||||
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm512_load_si512(&r3);\
|
||||
q1 = _mm512_load_si512(&p3);\
|
||||
s3 = _mm512_load_si512(&r3);\
|
||||
q3 = _mm512_load_si512(&p3);\
|
||||
s1 = _mm512_unpackhi_epi32(s1,r2);\
|
||||
q1 = _mm512_unpackhi_epi32(q1,p2);\
|
||||
s3 = _mm512_unpacklo_epi32(s3,r2);\
|
||||
q3 = _mm512_unpacklo_epi32(q3,p2);\
|
||||
s0 = _mm512_load_si512(&s1);\
|
||||
q0 = _mm512_load_si512(&q1);\
|
||||
s2 = _mm512_load_si512(&s3);\
|
||||
q2 = _mm512_load_si512(&q3);\
|
||||
r3 = _mm512_load_si512(&r1);\
|
||||
p3 = _mm512_load_si512(&p1);\
|
||||
r1 = _mm512_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm512_unpacklo_epi32(p1,p0);\
|
||||
r3 = _mm512_unpackhi_epi32(r3,r0);\
|
||||
p3 = _mm512_unpackhi_epi32(p3,p0);\
|
||||
s0 = _mm512_unpackhi_epi64(s0,r3);\
|
||||
q0 = _mm512_unpackhi_epi64(q0,p3);\
|
||||
s1 = _mm512_unpacklo_epi64(s1,r3);\
|
||||
q1 = _mm512_unpacklo_epi64(q1,p3);\
|
||||
s2 = _mm512_unpackhi_epi64(s2,r1);\
|
||||
q2 = _mm512_unpackhi_epi64(q2,p1);\
|
||||
s3 = _mm512_unpacklo_epi64(s3,r1);\
|
||||
q3 = _mm512_unpacklo_epi64(q3,p1);
|
||||
|
||||
#define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
|
||||
// One Luffa-512 round over 4 interleaved lanes (AVX-512).
// Performs the message injection (MI): feed-forward xor of all five 256-bit
// state words, MULT2 tweaks, and message mixing — then the permutation P:
// 8 step rounds over the tweaked state (STEP_PART4W) plus 8 rounds over the
// final 256-bit word (STEP_PART24W), using round constants cns4w(0..31).
void rnd512_4way( luffa_4way_context *state, __m512i *msg )
{
    __m512i t0, t1;
    __m512i *chainv = state->chainv;   // 5 x 256-bit words as 10 __m512i
    __m512i msg0, msg1;
    __m512i tmp[2];
    __m512i x[8];
    // Selects the low 32-bit word of each 128-bit group (for MULT24W).
    const __m512i MASK = m512_const2_64( 0, 0x00000000ffffffff );

    // t = xor of all five state words.
    t0 = chainv[0];
    t1 = chainv[1];

    t0 = _mm512_xor_si512( t0, chainv[2] );
    t1 = _mm512_xor_si512( t1, chainv[3] );
    t0 = _mm512_xor_si512( t0, chainv[4] );
    t1 = _mm512_xor_si512( t1, chainv[5] );
    t0 = _mm512_xor_si512( t0, chainv[6] );
    t1 = _mm512_xor_si512( t1, chainv[7] );
    t0 = _mm512_xor_si512( t0, chainv[8] );
    t1 = _mm512_xor_si512( t1, chainv[9] );

    MULT24W( t0, t1, MASK );

    // Reverse 32-bit word order within each 128-bit group of the message.
    msg0 = _mm512_shuffle_epi32( msg[0], 27 );
    msg1 = _mm512_shuffle_epi32( msg[1], 27 );

    // Feed t back into every state word.
    chainv[0] = _mm512_xor_si512( chainv[0], t0 );
    chainv[1] = _mm512_xor_si512( chainv[1], t1 );
    chainv[2] = _mm512_xor_si512( chainv[2], t0 );
    chainv[3] = _mm512_xor_si512( chainv[3], t1 );
    chainv[4] = _mm512_xor_si512( chainv[4], t0 );
    chainv[5] = _mm512_xor_si512( chainv[5], t1 );
    chainv[6] = _mm512_xor_si512( chainv[6], t0 );
    chainv[7] = _mm512_xor_si512( chainv[7], t1 );
    chainv[8] = _mm512_xor_si512( chainv[8], t0 );
    chainv[9] = _mm512_xor_si512( chainv[9], t1 );

    // Forward tweak chain: V_i = MULT2(V_i) ^ V_{i+1}.
    t0 = chainv[0];
    t1 = chainv[1];

    MULT24W( chainv[0], chainv[1], MASK );
    chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] );
    chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] );

    MULT24W( chainv[2], chainv[3], MASK );
    chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]);
    chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]);

    MULT24W( chainv[4], chainv[5], MASK );
    chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]);
    chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]);

    MULT24W( chainv[6], chainv[7], MASK );
    chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]);
    chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);

    MULT24W( chainv[8], chainv[9], MASK );
    chainv[8] = _mm512_xor_si512( chainv[8], t0 );
    chainv[9] = _mm512_xor_si512( chainv[9], t1 );

    // Backward tweak chain.
    t0 = chainv[8];
    t1 = chainv[9];

    MULT24W( chainv[8], chainv[9], MASK );
    chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
    chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] );

    MULT24W( chainv[6], chainv[7], MASK );
    chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] );
    chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] );

    MULT24W( chainv[4], chainv[5], MASK );
    chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] );
    chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] );

    MULT24W( chainv[2], chainv[3], MASK );
    chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
    chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );

    MULT24W( chainv[0], chainv[1], MASK );
    chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
    chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );

    // Inject successively tweaked copies of the message into each word.
    MULT24W( msg0, msg1, MASK );
    chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
    chainv[3] = _mm512_xor_si512( chainv[3], msg1 );

    MULT24W( msg0, msg1, MASK );
    chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
    chainv[5] = _mm512_xor_si512( chainv[5], msg1 );

    // NOTE(review): the segment below repeats the tail of the backward tweak
    // chain and the first two message injections.  This does not match the
    // canonical Luffa MI sequence (tweak chain once, then five injections)
    // and looks like a duplicated hunk from a merge/diff — verify against
    // upstream luffa-hash-2way.c before relying on this code path.
    MULT24W( chainv[2], chainv[3], MASK );
    chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
    chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );

    MULT24W( chainv[0], chainv[1], MASK );
    chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
    chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );

    MULT24W( msg0, msg1, MASK );
    chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
    chainv[3] = _mm512_xor_si512( chainv[3], msg1 );

    MULT24W( msg0, msg1, MASK );
    chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
    chainv[5] = _mm512_xor_si512( chainv[5], msg1 );

    MULT24W( msg0, msg1, MASK );
    chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
    chainv[7] = _mm512_xor_si512( chainv[7], msg1 );

    MULT24W( msg0, msg1, MASK );
    chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
    chainv[9] = _mm512_xor_si512( chainv[9], msg1 );

    MULT24W( msg0, msg1, MASK );

    // Per-word left rotations of 1..4 bits before the permutation.
    // replace with ror
    chainv[3] = _mm512_or_si512( _mm512_slli_epi32( chainv[3], 1 ),
                                 _mm512_srli_epi32( chainv[3], 31 ) );
    chainv[5] = _mm512_or_si512( _mm512_slli_epi32( chainv[5], 2 ),
                                 _mm512_srli_epi32( chainv[5], 30 ) );
    chainv[7] = _mm512_or_si512( _mm512_slli_epi32( chainv[7], 3 ),
                                 _mm512_srli_epi32( chainv[7], 29 ) );
    chainv[9] = _mm512_or_si512( _mm512_slli_epi32( chainv[9], 4 ),
                                 _mm512_srli_epi32( chainv[9], 28 ) );

    // Transpose the first four state words into step-round layout x[0..7].
    NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
                  x[0], x[1], x[2], x[3],
                  chainv[1],chainv[3],chainv[5],chainv[7],
                  x[4], x[5], x[6], x[7] );

    // 8 step rounds with constants 0..15 (two per round).
    STEP_PART4W( &x[0], cns4w( 0), cns4w( 1), &tmp[0] );
    STEP_PART4W( &x[0], cns4w( 2), cns4w( 3), &tmp[0] );
    STEP_PART4W( &x[0], cns4w( 4), cns4w( 5), &tmp[0] );
    STEP_PART4W( &x[0], cns4w( 6), cns4w( 7), &tmp[0] );
    STEP_PART4W( &x[0], cns4w( 8), cns4w( 9), &tmp[0] );
    STEP_PART4W( &x[0], cns4w(10), cns4w(11), &tmp[0] );
    STEP_PART4W( &x[0], cns4w(12), cns4w(13), &tmp[0] );
    STEP_PART4W( &x[0], cns4w(14), cns4w(15), &tmp[0] );

    // Transpose back into chaining-value layout.
    MIXTON10244W( x[0], x[1], x[2], x[3],
                  chainv[0], chainv[2], chainv[4],chainv[6],
                  x[4], x[5], x[6], x[7],
                  chainv[1],chainv[3],chainv[5],chainv[7]);

    /* Process last 256-bit block */
    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(16), cns4w(17),
                  tmp[0], tmp[1] );
    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(18), cns4w(19),
                  tmp[0], tmp[1] );
    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(20), cns4w(21),
                  tmp[0], tmp[1] );
    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(22), cns4w(23),
                  tmp[0], tmp[1] );
    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(24), cns4w(25),
                  tmp[0], tmp[1] );
    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(26), cns4w(27),
                  tmp[0], tmp[1] );
    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(28), cns4w(29),
                  tmp[0], tmp[1] );
    STEP_PART24W( chainv[8], chainv[9], t0, t1, cns4w(30), cns4w(31),
                  tmp[0], tmp[1] );
}
|
||||
|
||||
// Luffa-512 output stage for 4 interleaved lanes: run two blank rounds
// (message = 0); after each, xor all five state words together, reverse the
// 32-bit word order, byte-swap to big-endian and append 256 output bits per
// lane to 'b'.  Produces the full 512-bit digest in b[0..3] (as __m512i).
void finalization512_4way( luffa_4way_context *state, uint32 *b )
{
    uint32 hash[8*4] __attribute((aligned(128)));   // two __m512i of staging
    __m512i* chainv = state->chainv;
    __m512i t[2];
    __m512i zero[2];
    zero[0] = zero[1] = m512_zero;
    // Byte-swap each 32-bit word (big-endian output convention).
    const __m512i shuff_bswap32 = m512_const_64(
                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );

    /*---- blank round with m=0 ----*/
    rnd512_4way( state, zero );

    // First 256 output bits: xor of all five chaining words.
    t[0] = chainv[0];
    t[1] = chainv[1];

    t[0] = _mm512_xor_si512( t[0], chainv[2] );
    t[1] = _mm512_xor_si512( t[1], chainv[3] );
    t[0] = _mm512_xor_si512( t[0], chainv[4] );
    t[1] = _mm512_xor_si512( t[1], chainv[5] );
    t[0] = _mm512_xor_si512( t[0], chainv[6] );
    t[1] = _mm512_xor_si512( t[1], chainv[7] );
    t[0] = _mm512_xor_si512( t[0], chainv[8] );
    t[1] = _mm512_xor_si512( t[1], chainv[9] );

    // Undo the word-order reversal applied on input.
    t[0] = _mm512_shuffle_epi32( t[0], 27 );
    t[1] = _mm512_shuffle_epi32( t[1], 27 );

    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
    _mm512_store_si512( (__m512i*)&hash[8], t[1] );

    casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
                               casti_m512i( hash, 0 ), shuff_bswap32 );
    casti_m512i( b, 1 ) = _mm512_shuffle_epi8(
                               casti_m512i( hash, 1 ), shuff_bswap32 );

    // Second blank round yields the remaining 256 bits.
    rnd512_4way( state, zero );

    t[0] = chainv[0];
    t[1] = chainv[1];
    t[0] = _mm512_xor_si512( t[0], chainv[2] );
    t[1] = _mm512_xor_si512( t[1], chainv[3] );
    t[0] = _mm512_xor_si512( t[0], chainv[4] );
    t[1] = _mm512_xor_si512( t[1], chainv[5] );
    t[0] = _mm512_xor_si512( t[0], chainv[6] );
    t[1] = _mm512_xor_si512( t[1], chainv[7] );
    t[0] = _mm512_xor_si512( t[0], chainv[8] );
    t[1] = _mm512_xor_si512( t[1], chainv[9] );

    t[0] = _mm512_shuffle_epi32( t[0], 27 );
    t[1] = _mm512_shuffle_epi32( t[1], 27 );

    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
    _mm512_store_si512( (__m512i*)&hash[8], t[1] );

    casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
                               casti_m512i( hash, 0 ), shuff_bswap32 );
    casti_m512i( b, 3 ) = _mm512_shuffle_epi8(
                               casti_m512i( hash, 1 ), shuff_bswap32 );
}
|
||||
|
||||
// Initialize a 4-lane (AVX-512) Luffa context for the given output size.
// The 10 x 128-bit IV words are each broadcast to all four lanes; the
// partial-block buffer is cleared.  Always returns 0.
int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
{
    state->hashbitlen = hashbitlen;
    __m128i *iv = (__m128i*)IV;

    state->chainv[0] = m512_const1_128( iv[0] );
    state->chainv[1] = m512_const1_128( iv[1] );
    state->chainv[2] = m512_const1_128( iv[2] );
    state->chainv[3] = m512_const1_128( iv[3] );
    state->chainv[4] = m512_const1_128( iv[4] );
    state->chainv[5] = m512_const1_128( iv[5] );
    state->chainv[6] = m512_const1_128( iv[6] );
    state->chainv[7] = m512_const1_128( iv[7] );
    state->chainv[8] = m512_const1_128( iv[8] );
    state->chainv[9] = m512_const1_128( iv[9] );

    // Clear the 32-byte (per lane) partial-block buffer.
    ((__m512i*)state->buffer)[0] = m512_zero;
    ((__m512i*)state->buffer)[1] = m512_zero;

    return 0;
}
|
||||
|
||||
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called only call luffa_update or luffa_close.
//
// Absorb 'len' bytes (per lane) of 4-way interleaved data.  Full 32-byte
// (per lane) blocks are byte-swapped to big-endian and compressed; a
// trailing 16-byte remainder (the 80-byte midstate case) is pre-padded and
// held in state->buffer for the close call.  Always returns 0.
// NOTE(review): only a 16-byte remainder is supported — other partial sizes
// would leave buffer[1]'s padding wrong; confirm callers only pass 64/80
// byte lengths.
int luffa_4way_update( luffa_4way_context *state, const void *data,
                       size_t len )
{
    __m512i *vdata  = (__m512i*)data;
    __m512i *buffer = (__m512i*)state->buffer;
    __m512i msg[2];
    int i;
    int blocks = (int)len >> 5;   // 32 bytes per lane per block
    const __m512i shuff_bswap32 = m512_const_64(
                                    0x3c3d3e3f38393a3b, 0x3435363730313233,
                                    0x2c2d2e2f28292a2b, 0x2425262720212223,
                                    0x1c1d1e1f18191a1b, 0x1415161710111213,
                                    0x0c0d0e0f08090a0b, 0x0405060700010203 );

    state-> rembytes = (int)len & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
       rnd512_4way( state, msg );
    }

    // 16 byte partial block exists for 80 byte len
    // store in buffer for transform in final for midstate to work
    if ( state->rembytes )
    {
      // remaining data bytes, plus the 0x80... pad marker word
      buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
      buffer[1] = m512_const2_64( 0, 0x0000000080000000 );
    }
    return 0;
}
|
||||
|
||||
// Finalize: compress the pending padded block (either the buffered partial
// data from luffa_4way_update or a bare padding block) and emit the digest.
// Always returns 0.
int luffa_4way_close( luffa_4way_context *state, void *hashval )
{
    __m512i *buffer = (__m512i*)state->buffer;
    __m512i msg[2];

    // transform pad block
    if ( state->rembytes )
      // not empty, data is in buffer (already bswapped and pre-padded)
      rnd512_4way( state, buffer );
    else
    {     // empty pad block, constant data
      msg[0] = m512_const2_64( 0, 0x0000000080000000 );
      msg[1] = m512_zero;
      rnd512_4way( state, msg );
    }
    finalization512_4way( state, (uint32*)hashval );

    // NOTE(review): 'hashval+32' is arithmetic on void* (GCC extension,
    // +32 bytes); for hashbitlen > 512 confirm this offset is correct for
    // the 4-way interleaved layout.  Dead path for standard Luffa-512.
    if ( state->hashbitlen > 512 )
        finalization512_4way( state, (uint32*)( hashval+32 ) );
    return 0;
}
|
||||
|
||||
// One-shot absorb + finalize, optimized for 64 and 80 byte inputs.
// Full 32-byte (per lane) blocks are byte-swapped and compressed; the final
// block is either the 16-byte remainder plus padding or a bare padding
// block.  Writes the digest to 'output'; always returns 0.
int luffa_4way_update_close( luffa_4way_context *state,
                             void *output, const void *data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
    const __m512i *vdata  = (__m512i*)data;
    __m512i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );   // 32 bytes per lane per block
    const __m512i shuff_bswap32 = m512_const_64(
                                    0x3c3d3e3f38393a3b, 0x3435363730313233,
                                    0x2c2d2e2f28292a2b, 0x2425262720212223,
                                    0x1c1d1e1f18191a1b, 0x1415161710111213,
                                    0x0c0d0e0f08090a0b, 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
       msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
       rnd512_4way( state, msg );
    }

    // 16 byte partial block exists for 80 byte len
    if ( state->rembytes )
    {
       // padding of partial block: data word + 0x80... pad marker
       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
       msg[1] = m512_const2_64( 0, 0x0000000080000000 );
       rnd512_4way( state, msg );
    }
    else
    {
       // empty pad block
       msg[0] = m512_const2_64( 0, 0x0000000080000000 );
       msg[1] = m512_zero;
       rnd512_4way( state, msg );
    }

    finalization512_4way( state, (uint32*)output );
    // NOTE(review): void* arithmetic (GCC extension, +32 bytes); dead path
    // for standard Luffa-512 — confirm offset if ever enabled.
    if ( state->hashbitlen > 512 )
        finalization512_4way( state, (uint32*)( output+32 ) );

    return 0;
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
// XOR the two halves of a step-round state with the round constants.
// Fix: the previous version XORed 'b' with 'c1' twice; since x ^ c1 ^ c1 == x
// the second XOR cancelled the first and 'b' never received its constant
// (the AVX-512 ADD_CONSTANT4W above shows the intended single-XOR form).
#define ADD_CONSTANT(a,b,c0,c1)\
    a = _mm256_xor_si256(a,c0);\
    b = _mm256_xor_si256(b,c1);
|
||||
|
||||
#define MULT2( a0, a1, mask ) \
|
||||
do { \
|
||||
@@ -115,7 +695,7 @@ do { \
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s3 = _mm256_load_si256(&r2);\
|
||||
q3 = _mm256_load_si256(&p2);\
|
||||
q3 = _mm256_load_si256(&p2);
|
||||
|
||||
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm256_load_si256(&r0);\
|
||||
@@ -174,57 +754,6 @@ do { \
|
||||
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
|
||||
/* initial values of chaining variables */
|
||||
static const uint32 IV[40] __attribute((aligned(32))) = {
|
||||
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
|
||||
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
|
||||
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
|
||||
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
|
||||
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
|
||||
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
|
||||
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
|
||||
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
|
||||
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
|
||||
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
|
||||
};
|
||||
|
||||
/* Round Constants */
|
||||
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
|
||||
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
|
||||
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
|
||||
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
|
||||
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
|
||||
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
|
||||
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
|
||||
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
|
||||
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
|
||||
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
|
||||
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
|
||||
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
|
||||
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
|
||||
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
|
||||
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
|
||||
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
|
||||
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
|
||||
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
|
||||
0x00000000,0x00000000,0x00000000,0x5090d577,
|
||||
0x00000000,0x00000000,0x00000000,0xac11d7fa,
|
||||
0x00000000,0x00000000,0x00000000,0x2d1925ab,
|
||||
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
|
||||
0x00000000,0x00000000,0x00000000,0xb46496ac,
|
||||
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
|
||||
0x00000000,0x00000000,0x00000000,0xd1925ab0,
|
||||
0x00000000,0x00000000,0x00000000,0x78602649,
|
||||
0x00000000,0x00000000,0x00000000,0x29131ab6,
|
||||
0x00000000,0x00000000,0x00000000,0x8edae952,
|
||||
0x00000000,0x00000000,0x00000000,0x0fc053c3,
|
||||
0x00000000,0x00000000,0x00000000,0x3b6ba548,
|
||||
0x00000000,0x00000000,0x00000000,0x3f014f0c,
|
||||
0x00000000,0x00000000,0x00000000,0xedae9520,
|
||||
0x00000000,0x00000000,0x00000000,0xfc053c31
|
||||
};
|
||||
|
||||
|
||||
|
||||
/***************************************************/
|
||||
/* Round function */
|
||||
@@ -385,13 +914,15 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
|
||||
void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
||||
{
|
||||
uint32 hash[8] __attribute((aligned(64)));
|
||||
uint32 hash[8*2] __attribute((aligned(64)));
|
||||
__m256i* chainv = state->chainv;
|
||||
__m256i t[2];
|
||||
__m256i zero[2];
|
||||
zero[0] = zero[1] = m256_zero;
|
||||
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
/*---- blank round with m=0 ----*/
|
||||
rnd512_2way( state, zero );
|
||||
|
||||
@@ -475,8 +1006,10 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
__m256i msg[2];
|
||||
int i;
|
||||
int blocks = (int)len >> 5;
|
||||
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
state-> rembytes = (int)len & 0x1F;
|
||||
|
||||
// full blocks
|
||||
@@ -528,8 +1061,10 @@ int luffa_2way_update_close( luffa_2way_context *state,
|
||||
__m256i msg[2];
|
||||
int i;
|
||||
const int blocks = (int)( inlen >> 5 );
|
||||
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
|
||||
state->rembytes = inlen & 0x1F;
|
||||
|
||||
|
573
algo/luffa/luffa-hash-2way.c.save
Normal file
573
algo/luffa/luffa-hash-2way.c.save
Normal file
@@ -0,0 +1,573 @@
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#include "luffa-hash-2way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
/* Broadcast 128-bit round constant i into both 128-bit lanes of a
 * 256-bit vector (one constant per interleaved hash lane). */
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )

/* Inject round constants c0/c1 into the state words a/b by XOR. */
#define ADD_CONSTANT(a,b,c0,c1)\
    a = _mm256_xor_si256(a,c0);\
    b = _mm256_xor_si256(b,c1);

/* Luffa message-injection tweak: the name suggests the spec's
 * "multiply by 2" in GF; implemented as a cross-word rotate with
 * feedback of the masked low word of a1 (mask selects the low
 * 32-bit word of each 128-bit lane -- see MASK in rnd512_2way). */
#define MULT2( a0, a1, mask ) \
do { \
  __m256i b = _mm256_xor_si256( a0, \
        _mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
|
||||
|
||||
// confirm pointer arithmetic
|
||||
// ok but use array indexes
|
||||
#define STEP_PART(x,c0,c1,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
||||
|
||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
||||
t = _mm256_load_si256(&a0);\
|
||||
a0 = _mm256_or_si256(a0,a1);\
|
||||
a2 = _mm256_xor_si256(a2,a3);\
|
||||
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
|
||||
a0 = _mm256_xor_si256(a0,a3);\
|
||||
a3 = _mm256_and_si256(a3,t);\
|
||||
a1 = _mm256_xor_si256(a1,a3);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a0);\
|
||||
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
|
||||
a2 = _mm256_xor_si256(a2,a1);\
|
||||
a1 = _mm256_or_si256(a1,a3);\
|
||||
t = _mm256_xor_si256(t,a1);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a1);\
|
||||
a1 = _mm256_xor_si256(a1,a0);\
|
||||
a0 = _mm256_load_si256(&t);\
|
||||
|
||||
#define MIXWORD(a,b,t1,t2)\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,2);\
|
||||
t2 = _mm256_srli_epi32(a,30);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,14);\
|
||||
t2 = _mm256_srli_epi32(b,18);\
|
||||
b = _mm256_or_si256(t1,t2);\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,10);\
|
||||
t2 = _mm256_srli_epi32(a,22);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,1);\
|
||||
t2 = _mm256_srli_epi32(b,31);\
|
||||
b = _mm256_or_si256(t1,t2);
|
||||
|
||||
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm256_shuffle_epi32(a1,147);\
|
||||
t0 = _mm256_load_si256(&a1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm256_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm256_shuffle_epi32(t0,78);\
|
||||
a0 = _mm256_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm256_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm256_load_si256(&a1);\
|
||||
a0 = _mm256_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm256_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm256_shuffle_epi32(a1,57);\
|
||||
MIXWORD(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT(a0,a1,c0,c1);
|
||||
|
||||
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm256_load_si256(&r1);\
|
||||
q2 = _mm256_load_si256(&p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,216);\
|
||||
p2 = _mm256_shuffle_epi32(p2,216);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
s2 = _mm256_unpackhi_epi32(s2,r0);\
|
||||
q2 = _mm256_unpackhi_epi32(q2,p0);\
|
||||
s0 = _mm256_load_si256(&r2);\
|
||||
q0 = _mm256_load_si256(&p2);\
|
||||
r2 = _mm256_unpacklo_epi64(r2,r1);\
|
||||
p2 = _mm256_unpacklo_epi64(p2,p1);\
|
||||
s1 = _mm256_load_si256(&s0);\
|
||||
q1 = _mm256_load_si256(&q0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,225);\
|
||||
p2 = _mm256_shuffle_epi32(p2,225);\
|
||||
r0 = _mm256_load_si256(&s1);\
|
||||
p0 = _mm256_load_si256(&q1);\
|
||||
s0 = _mm256_shuffle_epi32(s0,225);\
|
||||
q0 = _mm256_shuffle_epi32(q0,225);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,s2);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,q2);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,s2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,q2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s3 = _mm256_load_si256(&r2);\
|
||||
q3 = _mm256_load_si256(&p2);\
|
||||
|
||||
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm256_load_si256(&r0);\
|
||||
q0 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r2);\
|
||||
q1 = _mm256_load_si256(&p2);\
|
||||
r0 = _mm256_unpackhi_epi32(r0,r1);\
|
||||
p0 = _mm256_unpackhi_epi32(p0,p1);\
|
||||
r2 = _mm256_unpackhi_epi32(r2,r3);\
|
||||
p2 = _mm256_unpackhi_epi32(p2,p3);\
|
||||
s0 = _mm256_unpacklo_epi32(s0,r1);\
|
||||
q0 = _mm256_unpacklo_epi32(q0,p1);\
|
||||
s1 = _mm256_unpacklo_epi32(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi32(q1,p3);\
|
||||
r1 = _mm256_load_si256(&r0);\
|
||||
p1 = _mm256_load_si256(&p0);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,r2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,p2);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,s1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,q1);\
|
||||
r1 = _mm256_unpacklo_epi64(r1,r2);\
|
||||
p1 = _mm256_unpacklo_epi64(p1,p2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r1);\
|
||||
q1 = _mm256_load_si256(&p1);\
|
||||
|
||||
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm256_load_si256(&r3);\
|
||||
q1 = _mm256_load_si256(&p3);\
|
||||
s3 = _mm256_load_si256(&r3);\
|
||||
q3 = _mm256_load_si256(&p3);\
|
||||
s1 = _mm256_unpackhi_epi32(s1,r2);\
|
||||
q1 = _mm256_unpackhi_epi32(q1,p2);\
|
||||
s3 = _mm256_unpacklo_epi32(s3,r2);\
|
||||
q3 = _mm256_unpacklo_epi32(q3,p2);\
|
||||
s0 = _mm256_load_si256(&s1);\
|
||||
q0 = _mm256_load_si256(&q1);\
|
||||
s2 = _mm256_load_si256(&s3);\
|
||||
q2 = _mm256_load_si256(&q3);\
|
||||
r3 = _mm256_load_si256(&r1);\
|
||||
p3 = _mm256_load_si256(&p1);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
r3 = _mm256_unpackhi_epi32(r3,r0);\
|
||||
p3 = _mm256_unpackhi_epi32(p3,p0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r3);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p3);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,p3);\
|
||||
s2 = _mm256_unpackhi_epi64(s2,r1);\
|
||||
q2 = _mm256_unpackhi_epi64(q2,p1);\
|
||||
s3 = _mm256_unpacklo_epi64(s3,r1);\
|
||||
q3 = _mm256_unpacklo_epi64(q3,p1);
|
||||
|
||||
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
|
||||
/* initial values of chaining variables */
|
||||
static const uint32 IV[40] __attribute((aligned(32))) = {
|
||||
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
|
||||
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
|
||||
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
|
||||
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
|
||||
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
|
||||
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
|
||||
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
|
||||
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
|
||||
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
|
||||
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
|
||||
};
|
||||
|
||||
/* Round Constants */
|
||||
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
|
||||
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
|
||||
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
|
||||
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
|
||||
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
|
||||
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
|
||||
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
|
||||
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
|
||||
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
|
||||
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
|
||||
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
|
||||
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
|
||||
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
|
||||
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
|
||||
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
|
||||
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
|
||||
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
|
||||
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
|
||||
0x00000000,0x00000000,0x00000000,0x5090d577,
|
||||
0x00000000,0x00000000,0x00000000,0xac11d7fa,
|
||||
0x00000000,0x00000000,0x00000000,0x2d1925ab,
|
||||
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
|
||||
0x00000000,0x00000000,0x00000000,0xb46496ac,
|
||||
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
|
||||
0x00000000,0x00000000,0x00000000,0xd1925ab0,
|
||||
0x00000000,0x00000000,0x00000000,0x78602649,
|
||||
0x00000000,0x00000000,0x00000000,0x29131ab6,
|
||||
0x00000000,0x00000000,0x00000000,0x8edae952,
|
||||
0x00000000,0x00000000,0x00000000,0x0fc053c3,
|
||||
0x00000000,0x00000000,0x00000000,0x3b6ba548,
|
||||
0x00000000,0x00000000,0x00000000,0x3f014f0c,
|
||||
0x00000000,0x00000000,0x00000000,0xedae9520,
|
||||
0x00000000,0x00000000,0x00000000,0xfc053c31
|
||||
};
|
||||
|
||||
|
||||
|
||||
/***************************************************/
|
||||
/* Round function */
|
||||
/* state: hash context */
|
||||
|
||||
/* One Luffa-512 round for two interleaved lanes: message injection (MI)
 * into the five 256-bit chaining sub-blocks, then 8 permutation steps on
 * the first four sub-blocks and 8 steps on the fifth.
 * msg: two __m256i words = one 32-byte message block per lane. */
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
{
    __m256i t0, t1;
    __m256i *chainv = state->chainv;
    __m256i msg0, msg1;
    __m256i tmp[2];
    __m256i x[8];
    /* Per-128-bit-lane mask for the low 32-bit word, used by MULT2. */
    const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );

    /* t = XOR of all five chaining sub-blocks. */
    t0 = chainv[0];
    t1 = chainv[1];

    t0 = _mm256_xor_si256( t0, chainv[2] );
    t1 = _mm256_xor_si256( t1, chainv[3] );
    t0 = _mm256_xor_si256( t0, chainv[4] );
    t1 = _mm256_xor_si256( t1, chainv[5] );
    t0 = _mm256_xor_si256( t0, chainv[6] );
    t1 = _mm256_xor_si256( t1, chainv[7] );
    t0 = _mm256_xor_si256( t0, chainv[8] );
    t1 = _mm256_xor_si256( t1, chainv[9] );

    MULT2( t0, t1, MASK );

    /* Reverse 32-bit word order within each 128-bit half (shuffle 27 =
     * 0b00011011) to match Luffa's word ordering. */
    msg0 = _mm256_shuffle_epi32( msg[0], 27 );
    msg1 = _mm256_shuffle_epi32( msg[1], 27 );

    /* Mix t back into every sub-block. */
    chainv[0] = _mm256_xor_si256( chainv[0], t0 );
    chainv[1] = _mm256_xor_si256( chainv[1], t1 );
    chainv[2] = _mm256_xor_si256( chainv[2], t0 );
    chainv[3] = _mm256_xor_si256( chainv[3], t1 );
    chainv[4] = _mm256_xor_si256( chainv[4], t0 );
    chainv[5] = _mm256_xor_si256( chainv[5], t1 );
    chainv[6] = _mm256_xor_si256( chainv[6], t0 );
    chainv[7] = _mm256_xor_si256( chainv[7], t1 );
    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
    chainv[9] = _mm256_xor_si256( chainv[9], t1 );

    /* Forward tweak cascade: each sub-block is MULT2'd then XORed with
     * its successor, wrapping the saved first sub-block into the last. */
    t0 = chainv[0];
    t1 = chainv[1];

    MULT2( chainv[0], chainv[1], MASK );
    chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
    chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );

    MULT2( chainv[2], chainv[3], MASK );
    chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
    chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);

    MULT2( chainv[4], chainv[5], MASK );
    chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
    chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);

    MULT2( chainv[6], chainv[7], MASK );
    chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
    chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);

    MULT2( chainv[8], chainv[9], MASK );
    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
    chainv[9] = _mm256_xor_si256( chainv[9], t1 );

    /* Backward tweak cascade, wrapping the saved last sub-block into
     * the first, where the (word-reversed) message is also injected. */
    t0 = chainv[8];
    t1 = chainv[9];

    MULT2( chainv[8], chainv[9], MASK );
    chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
    chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );

    MULT2( chainv[6], chainv[7], MASK );
    chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
    chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );

    MULT2( chainv[4], chainv[5], MASK );
    chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
    chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );

    MULT2( chainv[2], chainv[3], MASK );
    chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
    chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );

    MULT2( chainv[0], chainv[1], MASK );
    chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
    chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );

    /* Inject successively-tweaked copies of the message into the
     * remaining sub-blocks. */
    MULT2( msg0, msg1, MASK );
    chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
    chainv[3] = _mm256_xor_si256( chainv[3], msg1 );

    MULT2( msg0, msg1, MASK );
    chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
    chainv[5] = _mm256_xor_si256( chainv[5], msg1 );

    MULT2( msg0, msg1, MASK );
    chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
    chainv[7] = _mm256_xor_si256( chainv[7], msg1 );

    MULT2( msg0, msg1, MASK );
    chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
    chainv[9] = _mm256_xor_si256( chainv[9], msg1 );

    MULT2( msg0, msg1, MASK );

    /* Per-sub-block 32-bit left rotations by 1..4 before the permutation. */
    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
                                 _mm256_srli_epi32( chainv[3], 31 ) );
    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
                                 _mm256_srli_epi32( chainv[5], 30 ) );
    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
                                 _mm256_srli_epi32( chainv[7], 29 ) );
    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
                                 _mm256_srli_epi32( chainv[9], 28 ) );

    /* Transpose the first four sub-blocks into step-function layout x[]. */
    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
                x[0], x[1], x[2], x[3],
                chainv[1],chainv[3],chainv[5],chainv[7],
                x[4], x[5], x[6], x[7] );

    /* 8 permutation steps with round constants 0..15. */
    STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
    STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
    STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
    STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
    STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
    STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
    STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
    STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );

    /* Transpose back into chaining-value layout. */
    MIXTON1024( x[0], x[1], x[2], x[3],
                chainv[0], chainv[2], chainv[4],chainv[6],
                x[4], x[5], x[6], x[7],
                chainv[1],chainv[3],chainv[5],chainv[7]);

    /* Process last 256-bit block (fifth sub-block), constants 16..31. */
    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
                tmp[0], tmp[1] );
    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
                tmp[0], tmp[1] );
}
|
||||
|
||||
/***************************************************/
|
||||
/* Finalization function */
|
||||
/* state: hash context */
|
||||
/* b[8]: hash values */
|
||||
|
||||
/* Finalization: run two blank rounds and squeeze 64 bytes of digest
 * per lane into b (b[0..15] from round 1, b[16..31] from round 2),
 * byte-swapped to big-endian output order.
 * Fix: hash[] must hold 16 uint32 (two 32-byte vector stores at
 * &hash[0] and &hash[8]); the previous hash[8] overflowed the stack
 * buffer by 32 bytes. */
void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
    uint32 hash[8*2] __attribute((aligned(64)));
    __m256i* chainv = state->chainv;
    __m256i t[2];
    __m256i zero[2];
    zero[0] = zero[1] = m256_zero;
    /* Per-lane 32-bit byte-swap shuffle control. */
    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
                                                 0x1415161710111213,
                                                 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );
    /*---- blank round with m=0 ----*/
    rnd512_2way( state, zero );

    /* First output chunk: XOR of all five chaining sub-blocks. */
    t[0] = chainv[0];
    t[1] = chainv[1];

    t[0] = _mm256_xor_si256( t[0], chainv[2] );
    t[1] = _mm256_xor_si256( t[1], chainv[3] );
    t[0] = _mm256_xor_si256( t[0], chainv[4] );
    t[1] = _mm256_xor_si256( t[1], chainv[5] );
    t[0] = _mm256_xor_si256( t[0], chainv[6] );
    t[1] = _mm256_xor_si256( t[1], chainv[7] );
    t[0] = _mm256_xor_si256( t[0], chainv[8] );
    t[1] = _mm256_xor_si256( t[1], chainv[9] );

    /* Restore spec word order within each 128-bit lane. */
    t[0] = _mm256_shuffle_epi32( t[0], 27 );
    t[1] = _mm256_shuffle_epi32( t[1], 27 );

    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
    _mm256_store_si256( (__m256i*)&hash[8], t[1] );

    casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
                                casti_m256i( hash, 0 ), shuff_bswap32 );
    casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
                                casti_m256i( hash, 1 ), shuff_bswap32 );

    /* Second blank round produces the second 32 bytes per lane. */
    rnd512_2way( state, zero );

    t[0] = chainv[0];
    t[1] = chainv[1];
    t[0] = _mm256_xor_si256( t[0], chainv[2] );
    t[1] = _mm256_xor_si256( t[1], chainv[3] );
    t[0] = _mm256_xor_si256( t[0], chainv[4] );
    t[1] = _mm256_xor_si256( t[1], chainv[5] );
    t[0] = _mm256_xor_si256( t[0], chainv[6] );
    t[1] = _mm256_xor_si256( t[1], chainv[7] );
    t[0] = _mm256_xor_si256( t[0], chainv[8] );
    t[1] = _mm256_xor_si256( t[1], chainv[9] );

    t[0] = _mm256_shuffle_epi32( t[0], 27 );
    t[1] = _mm256_shuffle_epi32( t[1], 27 );

    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
    _mm256_store_si256( (__m256i*)&hash[8], t[1] );

    casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
                                casti_m256i( hash, 0 ), shuff_bswap32 );
    casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
                                casti_m256i( hash, 1 ), shuff_bswap32 );
}
|
||||
|
||||
/* Initialize both lanes of the 2-way context: broadcast each of the
 * ten 128-bit IV sub-words into a 256-bit chaining register and clear
 * the partial-block buffer. Always returns 0. */
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
    const __m128i *iv = (const __m128i*)IV;
    __m256i *buf = (__m256i*)state->buffer;
    int i;

    state->hashbitlen = hashbitlen;

    for ( i = 0; i < 10; i++ )
        state->chainv[i] = m256_const1_128( iv[i] );

    buf[0] = m256_zero;
    buf[1] = m256_zero;

    return 0;
}
|
||||
|
||||
// Do not call luffa_update_close after having called luffa_update.
|
||||
// Once luffa_update has been called only call luffa_update or luffa_close.
|
||||
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called only call luffa_update or luffa_close.
/* Absorb len bytes (per lane, 2 lanes interleaved in vdata) in 32-byte
 * blocks. A 16-byte remainder (80-byte input case) is byte-swapped,
 * padded, and parked in state->buffer for luffa_2way_close.
 * Always returns 0. */
int luffa_2way_update( luffa_2way_context *state, const void *data,
                       size_t len )
{
    __m256i *vdata  = (__m256i*)data;
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];
    int i;
    int blocks = (int)len >> 5;   /* number of full 32-byte blocks */
    /* Per-lane 32-bit byte-swap shuffle control. */
    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
                                                 0x1415161710111213,
                                                 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );
    state-> rembytes = (int)len & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
       rnd512_2way( state, msg );
    }

    // 16 byte partial block exists for 80 byte len
    // store in buffer for transform in final for midstate to work
    if ( state->rembytes )
    {
      // remaining data bytes
      buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
      /* Padding marker: 0x80 first pad byte (already byte-swapped form). */
      buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
    }
    return 0;
}
|
||||
|
||||
/* Finish the hash started with luffa_2way_update: absorb the buffered
 * (already padded) partial block, or an empty pad block if none, then
 * emit the digest into hashval. For hashbitlen > 512 a second 32-byte
 * chunk per lane is emitted at offset 32. Always returns 0. */
int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];

    // transform pad block
    if ( state->rembytes )
      // not empty, pre-padded data is in buffer (see luffa_2way_update)
      rnd512_2way( state, buffer );
    else
    {   // empty pad block: 0x80 first pad byte, rest zero
      msg[0] = m256_const2_64( 0, 0x0000000080000000 );
      msg[1] = m256_zero;
      rnd512_2way( state, msg );
    }
    finalization512_2way( state, (uint32*)hashval );

    if ( state->hashbitlen > 512 )
       // cast through char* for the byte offset: arithmetic on void*
       // is a GCC extension, not ISO C
       finalization512_2way( state, (uint32*)( (char*)hashval + 32 ) );
    return 0;
}
|
||||
|
||||
/* One-shot absorb + finalize: process inlen bytes (per lane) in
 * 32-byte blocks, pad the 16-byte remainder (or append an empty pad
 * block), and emit the digest into output. For hashbitlen > 512 a
 * second 32-byte chunk per lane is written at offset 32.
 * Always returns 0. */
int luffa_2way_update_close( luffa_2way_context *state,
                void *output, const void *data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
    const __m256i *vdata = (__m256i*)data;
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );   /* full 32-byte blocks */
    /* Per-lane 32-bit byte-swap shuffle control. */
    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
                                                 0x1415161710111213,
                                                 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
       msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
       rnd512_2way( state, msg );
    }

    // 16 byte partial block exists for 80 byte len
    if ( state->rembytes )
    {
       // padding of partial block: 0x80 first pad byte, rest zero
       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
       msg[1] = m256_const2_64( 0, 0x0000000080000000 );
       rnd512_2way( state, msg );
    }
    else
    {
       // empty pad block
       msg[0] = m256_const2_64( 0, 0x0000000080000000 );
       msg[1] = m256_zero;
       rnd512_2way( state, msg );
    }

    finalization512_2way( state, (uint32*)output );
    if ( state->hashbitlen > 512 )
       // NOTE(review): arithmetic on void* (output+32) relies on the
       // GCC extension treating sizeof(void) as 1 -- confirm intended
       finalization512_2way( state, (uint32*)( output+32 ) );

    return 0;
}
|
||||
|
||||
#endif
|
@@ -51,12 +51,30 @@
|
||||
#define LIMIT_512 128
|
||||
/*********************************/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*2] __attribute((aligned(64)));
|
||||
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
|
||||
uint32 buffer[8*4];
|
||||
__m512i chainv[10]; /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
} luffa_2way_context;
|
||||
} luffa_4way_context __attribute((aligned(128)));
|
||||
|
||||
int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
|
||||
int luffa_4way_update( luffa_4way_context *state, const void *data,
|
||||
size_t len );
|
||||
int luffa_4way_close( luffa_4way_context *state, void *hashval );
|
||||
int luffa_4way_update_close( luffa_4way_context *state, void *output,
|
||||
const void *data, size_t inlen );
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*2];
|
||||
__m256i chainv[10]; /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
} luffa_2way_context __attribute((aligned(128)));
|
||||
|
||||
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
|
||||
int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
|
69
algo/luffa/luffa-hash-2way.h.save
Normal file
69
algo/luffa/luffa-hash-2way.h.save
Normal file
@@ -0,0 +1,69 @@
|
||||
#if !defined(LUFFA_HASH_2WAY_H__)
|
||||
#define LUFFA_HASH_2WAY_H__ 1
|
||||
/*
|
||||
* luffa_for_sse2.h
|
||||
* Version 2.0 (Sep 15th 2009)
|
||||
*
|
||||
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
|
||||
*
|
||||
* Hitachi, Ltd. is the owner of this software and hereby grant
|
||||
* the U.S. Government and any interested party the right to use
|
||||
* this software for the purposes of the SHA-3 evaluation process,
|
||||
* notwithstanding that this software is copyrighted.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/* The length of digests*/
|
||||
#define DIGEST_BIT_LEN_224 224
|
||||
#define DIGEST_BIT_LEN_256 256
|
||||
#define DIGEST_BIT_LEN_384 384
|
||||
#define DIGEST_BIT_LEN_512 512
|
||||
|
||||
/*********************************/
|
||||
/* The parameters of Luffa */
|
||||
#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/
|
||||
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
|
||||
* of a message block*/
|
||||
|
||||
/* The number of blocks in Luffa */
|
||||
#define WIDTH_224 3
|
||||
#define WIDTH_256 3
|
||||
#define WIDTH_384 4
|
||||
#define WIDTH_512 5
|
||||
|
||||
/* The limit of the length of message */
|
||||
#define LIMIT_224 64
|
||||
#define LIMIT_256 64
|
||||
#define LIMIT_384 128
|
||||
#define LIMIT_512 128
|
||||
/*********************************/
|
||||
|
||||
/* State for 2-way interleaved Luffa: two independent hashes carried in
 * the two 128-bit halves of each AVX2 register. */
typedef struct {
    uint32 buffer[8*2] __attribute((aligned(64)));   /* pending padded partial block, 2 lanes */
    __m256i chainv[10] __attribute((aligned(32)));   /* Chaining values */
    int hashbitlen;   /* hash length in bits (> 512 emits a second chunk) */
    int rembytes;     /* bytes of the last input not filling a 32-byte block */
} luffa_2way_context;
|
||||
|
||||
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
|
||||
int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
size_t len );
|
||||
int luffa_2way_close( luffa_2way_context *state, void *hashval );
|
||||
int luffa_2way_update_close( luffa_2way_context *state, void *output,
|
||||
const void *data, size_t inlen );
|
||||
|
||||
#endif
|
||||
#endif
|
@@ -542,8 +542,10 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
__m256i* chainv = (__m256i*)state->chainv;
|
||||
__m256i t;
|
||||
const __m128i zero = m128_zero;
|
||||
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
|
||||
rnd512( state, zero, zero );
|
||||
|
||||
|
@@ -3,22 +3,129 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(NIST5_4WAY)
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
|
||||
void nist5hash_4way( void *out, const void *input )
|
||||
#if defined(NIST5_8WAY)
|
||||
|
||||
void nist5hash_8way( void *out, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*16] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
|
||||
blake512_8way_context ctx_blake;
|
||||
hashState_groestl ctx_groestl;
|
||||
jh512_8way_context ctx_jh;
|
||||
skein512_8way_context ctx_skein;
|
||||
keccak512_8way_context ctx_keccak;
|
||||
|
||||
blake512_8way_init( &ctx_blake );
|
||||
blake512_8way_update( &ctx_blake, input, 80 );
|
||||
blake512_8way_close( &ctx_blake, vhash );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash0,
|
||||
(const char*)hash0, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash1,
|
||||
(const char*)hash1, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash2,
|
||||
(const char*)hash2, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash3,
|
||||
(const char*)hash3, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash4,
|
||||
(const char*)hash4, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash5,
|
||||
(const char*)hash5, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash6,
|
||||
(const char*)hash6, 512 );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash7,
|
||||
(const char*)hash7, 512 );
|
||||
|
||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7, 512 );
|
||||
|
||||
jh512_8way_init( &ctx_jh );
|
||||
jh512_8way_update( &ctx_jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx_jh, vhash );
|
||||
|
||||
keccak512_8way_init( &ctx_keccak );
|
||||
keccak512_8way_update( &ctx_keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx_keccak, vhash );
|
||||
|
||||
skein512_8way_init( &ctx_skein );
|
||||
skein512_8way_update( &ctx_skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx_skein, out );
|
||||
}
|
||||
|
||||
int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
nist5hash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(NIST5_4WAY)
|
||||
|
||||
void nist5hash_4way( void *out, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake512_4way_context ctx_blake;
|
||||
hashState_groestl ctx_groestl;
|
||||
jh512_4way_context ctx_jh;
|
||||
@@ -62,62 +169,39 @@ void nist5hash_4way( void *out, const void *input )
|
||||
int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[4*24] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]);
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
|
||||
uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for ( int m=0; m < 6; m++ )
|
||||
{
|
||||
if (Htarg <= htmax[m])
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
nist5hash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
nist5hash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash7[ lane ] & mask ) == 0 )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
}
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -2,8 +2,11 @@
|
||||
|
||||
bool register_nist5_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
#if defined (NIST5_4WAY)
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined (NIST5_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_nist5_8way;
|
||||
gate->hash = (void*)&nist5hash_8way;
|
||||
#elif defined (NIST5_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_nist5_4way;
|
||||
gate->hash = (void*)&nist5hash_4way;
|
||||
#else
|
||||
|
@@ -1,14 +1,23 @@
|
||||
#ifndef __NIST5_GATE_H__
|
||||
#define __NIST5_GATE_H__
|
||||
#define __NIST5_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define NIST5_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define NIST5_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define NIST5_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(NIST5_4WAY)
|
||||
#if defined(NIST5_8WAY)
|
||||
|
||||
void nist5hash_8way( void *state, const void *input );
|
||||
|
||||
int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(NIST5_4WAY)
|
||||
|
||||
void nist5hash_4way( void *state, const void *input );
|
||||
|
||||
|
@@ -1,12 +1,8 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "quark-gate.h"
|
||||
|
||||
#if defined (QUARK_4WAY)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -14,6 +10,258 @@
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
|
||||
#if defined (QUARK_8WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
jh512_8way_context jh;
|
||||
skein512_8way_context skein;
|
||||
keccak512_8way_context keccak;
|
||||
} quark_8way_ctx_holder;
|
||||
|
||||
quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128)));
|
||||
|
||||
void init_quark_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &quark_8way_ctx.blake );
|
||||
bmw512_8way_init( &quark_8way_ctx.bmw );
|
||||
init_groestl( &quark_8way_ctx.groestl, 64 );
|
||||
skein512_8way_init( &quark_8way_ctx.skein );
|
||||
jh512_8way_init( &quark_8way_ctx.jh );
|
||||
keccak512_8way_init( &quark_8way_ctx.keccak );
|
||||
}
|
||||
|
||||
void quark_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
__m512i* vh = (__m512i*)vhash;
|
||||
__m512i* vhA = (__m512i*)vhashA;
|
||||
__m512i* vhB = (__m512i*)vhashB;
|
||||
__mmask8 vh_mask;
|
||||
quark_8way_ctx_holder ctx;
|
||||
const uint32_t mask = 8;
|
||||
const __m512i bit3_mask = m512_const1_64( mask );
|
||||
const __m512i zero = _mm512_setzero_si512();
|
||||
|
||||
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
|
||||
|
||||
blake512_8way_update( &ctx.blake, input, 80 );
|
||||
blake512_8way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
|
||||
// AVX 512 cmpeq returns a bit mask instead of a vector mask.
|
||||
// This should simplify things but the logic doesn't seem to be working.
|
||||
// The problem appears to be related to the test to skip a hash if it isn't
|
||||
// to be used. Skipping the test for all 8 way hashes seems to have
|
||||
// fixed it. The hash selection blending works if the hash is produced
|
||||
// but the hash wasn't being produced when it should.
|
||||
// Both decisions are based on the same data, the __mmask8. It works
|
||||
// as a blend mask but not in a logical comparison, maybe the type is the
|
||||
// problem. Maybe a cast to int or movm is needed to make it work.
|
||||
// It's now moot because the hash can only be skipped 1 in 256 iterations
|
||||
// when hashing parallel 8 ways.
|
||||
// The performance impact of the workaround should be negligible.
|
||||
// It's a problem for another day.
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
if ( hash0[0] & mask )
|
||||
{
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0,
|
||||
(char*)hash0, 512 );
|
||||
}
|
||||
if ( hash1[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1,
|
||||
(char*)hash1, 512 );
|
||||
}
|
||||
if ( hash2[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2,
|
||||
(char*)hash2, 512 );
|
||||
}
|
||||
if ( hash3[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3,
|
||||
(char*)hash3, 512 );
|
||||
}
|
||||
if ( hash4[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4,
|
||||
(char*)hash4, 512 );
|
||||
}
|
||||
if ( hash5[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5,
|
||||
(char*)hash5, 512 );
|
||||
}
|
||||
if ( hash6[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6,
|
||||
(char*)hash6, 512 );
|
||||
}
|
||||
if ( hash7[0] & mask )
|
||||
{
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7,
|
||||
(char*)hash7, 512 );
|
||||
}
|
||||
|
||||
intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7, 512 );
|
||||
|
||||
if ( vh_mask & 0xff )
|
||||
{
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhashB );
|
||||
}
|
||||
|
||||
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
reinit_groestl( &ctx.groestl );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
512 );
|
||||
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
blake512_8way_init( &ctx.blake );
|
||||
blake512_8way_update( &ctx.blake, vhash, 64 );
|
||||
blake512_8way_close( &ctx.blake, vhashA );
|
||||
}
|
||||
|
||||
if ( vh_mask & 0xff )
|
||||
{
|
||||
bmw512_8way_init( &ctx.bmw );
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhashB );
|
||||
}
|
||||
|
||||
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
|
||||
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
skein512_8way_init( &ctx.skein );
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
|
||||
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
|
||||
zero );
|
||||
|
||||
if ( ( vh_mask & 0xff ) != 0xff )
|
||||
{
|
||||
keccak512_8way_init( &ctx.keccak );
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhashA );
|
||||
}
|
||||
|
||||
if ( vh_mask & 0xff )
|
||||
{
|
||||
jh512_8way_init( &ctx.jh );
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhashB );
|
||||
}
|
||||
|
||||
// Final blend, directly to state, only need 32 bytes.
|
||||
casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
|
||||
casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
|
||||
casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
|
||||
casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
|
||||
}
|
||||
|
||||
int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
quark_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, i, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, lane_hash, mythr, i );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#elif defined (QUARK_4WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
@@ -91,7 +339,7 @@ void quark_4way_hash( void *state, const void *input )
|
||||
|
||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
|
||||
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
{
|
||||
skein512_4way( &ctx.skein, vhash, 64 );
|
||||
skein512_4way_close( &ctx.skein, vhashB );
|
||||
@@ -117,14 +365,14 @@ void quark_4way_hash( void *state, const void *input )
|
||||
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
|
||||
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
{
|
||||
blake512_4way_init( &ctx.blake );
|
||||
blake512_4way( &ctx.blake, vhash, 64 );
|
||||
blake512_4way_close( &ctx.blake, vhashA );
|
||||
}
|
||||
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
{
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way( &ctx.bmw, vhash, 64 );
|
||||
@@ -142,14 +390,14 @@ void quark_4way_hash( void *state, const void *input )
|
||||
|
||||
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
|
||||
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
{
|
||||
keccak512_4way_init( &ctx.keccak );
|
||||
keccak512_4way( &ctx.keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx.keccak, vhashA );
|
||||
}
|
||||
|
||||
if ( mm256_anybits0( vh_mask ) )
|
||||
if ( mm256_anybits1( vh_mask ) )
|
||||
{
|
||||
jh512_4way_init( &ctx.jh );
|
||||
jh512_4way( &ctx.jh, vhash, 64 );
|
||||
|
@@ -2,7 +2,11 @@
|
||||
|
||||
bool register_quark_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (QUARK_4WAY)
|
||||
#if defined (QUARK_8WAY)
|
||||
init_quark_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_quark_8way;
|
||||
gate->hash = (void*)&quark_8way_hash;
|
||||
#elif defined (QUARK_4WAY)
|
||||
init_quark_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_quark_4way;
|
||||
gate->hash = (void*)&quark_4way_hash;
|
||||
@@ -11,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_quark;
|
||||
gate->hash = (void*)&quark_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,13 +4,22 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define QUARK_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define QUARK_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define QUARK_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_quark_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(QUARK_4WAY)
|
||||
#if defined(QUARK_8WAY)
|
||||
|
||||
void quark_8way_hash( void *state, const void *input );
|
||||
int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_quark_8way_ctx();
|
||||
|
||||
#elif defined(QUARK_4WAY)
|
||||
|
||||
void quark_4way_hash( void *state, const void *input );
|
||||
int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
|
||||
|
@@ -1,7 +1,4 @@
|
||||
#include "qubit-gate.h"
|
||||
|
||||
#if defined(QUBIT_2WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -12,6 +9,160 @@
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
#if defined(QUBIT_4WAY)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
luffa_4way_context luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_4way_context simd;
|
||||
hashState_echo echo;
|
||||
} qubit_4way_ctx_holder;
|
||||
|
||||
qubit_4way_ctx_holder qubit_4way_ctx;
|
||||
|
||||
void init_qubit_4way_ctx()
|
||||
{
|
||||
cubehashInit(&qubit_4way_ctx.cube,512,16,32);
|
||||
sph_shavite512_init(&qubit_4way_ctx.shavite);
|
||||
simd_4way_init( &qubit_4way_ctx.simd, 512 );
|
||||
init_echo(&qubit_4way_ctx.echo, 512);
|
||||
};
|
||||
|
||||
void qubit_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
qubit_4way_ctx_holder ctx;
|
||||
|
||||
memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
|
||||
luffa_4way_update( &ctx.luffa, input + (64<<2), 16 );
|
||||
luffa_4way_close( &ctx.luffa, vhash );
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
memcpy( output, hash0, 32 );
|
||||
memcpy( output+32, hash1, 32 );
|
||||
memcpy( output+64, hash2, 32 );
|
||||
memcpy( output+96, hash3, 32 );
|
||||
}
|
||||
|
||||
int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[4*24] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
casti_m512i( endiandata, 0 ) = mm512_bswap_32( casti_m512i( pdata, 0 ) );
|
||||
casti_m512i( endiandata, 1 ) = mm512_bswap_32( casti_m512i( pdata, 1 ) );
|
||||
casti_m512i( endiandata, 4 ) = mm512_bswap_32( casti_m512i( pdata, 4 ) );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
intrlv_4x128( (uint64_t*)vdata, edata, edata, 640 );
|
||||
|
||||
luffa_4way_init( &qubit_4way_ctx.luffa, 512 );
|
||||
luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );
|
||||
|
||||
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+4, n+1 );
|
||||
be32enc( noncep+8, n+2 );
|
||||
be32enc( noncep+12, n+3 );
|
||||
qubit_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
if ( !( hash[7] & mask ) )
|
||||
if ( fulltest( hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_lane_solution( work, hash, mythr, 0 );
|
||||
}
|
||||
if ( !( (hash+8)[7] & mask ) )
|
||||
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_lane_solution( work, hash+8, mythr, 1 );
|
||||
}
|
||||
if ( !( hash+16[7] & mask ) )
|
||||
if ( fulltest( hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+2;
|
||||
submit_lane_solution( work, hash, mythr, 2 );
|
||||
}
|
||||
if ( !( (hash+24)[7] & mask ) )
|
||||
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+3;
|
||||
submit_lane_solution( work, hash+8, mythr, 3 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(QUBIT_2WAY)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
luffa_2way_context luffa;
|
||||
|
@@ -2,6 +2,13 @@
|
||||
|
||||
bool register_qubit_algo( algo_gate_t* gate )
|
||||
{
|
||||
/*
|
||||
#if defined (QUBIT_4WAY)
|
||||
init_qubit_2way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_qubit_4way;
|
||||
gate->hash = (void*)&qubit_4way_hash;
|
||||
#elif defined (QUBIT_4WAY)
|
||||
*/
|
||||
#if defined (QUBIT_2WAY)
|
||||
init_qubit_2way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_qubit_2way;
|
||||
|
@@ -4,12 +4,26 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
/*
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define QUBIT_2WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
*/
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define QUBIT_2WAY
|
||||
#define QUBIT_2WAY 1
|
||||
#endif
|
||||
|
||||
bool register_qubit_algo( algo_gate_t* gate );
|
||||
/*
|
||||
#if defined(QUBIT_4WAY)
|
||||
|
||||
void qubit_4way_hash( void *state, const void *input );
|
||||
int scanhash_qubit_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_qubit_4way_ctx();
|
||||
|
||||
#elif defined(QUBIT_2WAY)
|
||||
*/
|
||||
#if defined(QUBIT_2WAY)
|
||||
|
||||
void qubit_2way_hash( void *state, const void *input );
|
||||
|
@@ -285,8 +285,10 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
|
||||
unsigned ptr;
|
||||
const int buf_size = 128;
|
||||
const int pad = buf_size - 16;
|
||||
const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f,
|
||||
0x0001020304050607 );
|
||||
const __m256i shuff_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f,
|
||||
0x1011121314151617,
|
||||
0x08090a0b0c0d0e0f,
|
||||
0x0001020304050607 );
|
||||
|
||||
ptr = (unsigned)sc->count & (buf_size - 1U);
|
||||
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -7,15 +7,37 @@
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
uint32_t A[ 32*2 ] __attribute__((aligned(64)));
|
||||
uint8_t buffer[ 128*2 ] __attribute__((aligned(64)));
|
||||
uint32_t A[ 32*4 ];
|
||||
uint8_t buffer[ 128*4 ];
|
||||
uint64_t count;
|
||||
unsigned int hashbitlen;
|
||||
unsigned int blocksize;
|
||||
unsigned int n_feistels;
|
||||
|
||||
} simd_4way_context __attribute__((aligned(128)));
|
||||
|
||||
int simd_4way_init( simd_4way_context *state, int hashbitlen );
|
||||
int simd_4way_update( simd_4way_context *state, const void *data,
|
||||
int databitlen );
|
||||
int simd_4way_close( simd_4way_context *state, void *hashval );
|
||||
int simd_4way_update_close( simd_4way_context *state, void *hashval,
|
||||
const void *data, int databitlen );
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t A[ 32*2 ];
|
||||
uint8_t buffer[ 128*2 ];
|
||||
uint64_t count;
|
||||
unsigned int hashbitlen;
|
||||
unsigned int blocksize;
|
||||
unsigned int n_feistels;
|
||||
|
||||
} simd_2way_context;
|
||||
} simd_2way_context __attribute__((aligned(128)));
|
||||
|
||||
int simd_2way_init( simd_2way_context *state, int hashbitlen );
|
||||
int simd_2way_update( simd_2way_context *state, const void *data,
|
||||
|
@@ -15,7 +15,7 @@
|
||||
|
||||
void skeinhash_8way( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash64[16*8] __attribute__ ((aligned (128)));
|
||||
uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
|
||||
skein512_8way_context ctx_skein;
|
||||
|
||||
//#if defined(__SHA__)
|
||||
@@ -29,7 +29,7 @@ void skeinhash_8way( void *state, const void *input )
|
||||
// uint32_t hash7[16] __attribute__ ((aligned (64)));
|
||||
// SHA256_CTX ctx_sha256;
|
||||
//#else
|
||||
uint32_t vhash32[32*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
|
||||
sha256_8way_context ctx_sha256;
|
||||
//#endif
|
||||
|
||||
@@ -135,7 +135,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
void skeinhash_4way( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash64[16*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhash64[8*4] __attribute__ ((aligned (128)));
|
||||
skein512_4way_context ctx_skein;
|
||||
#if defined(__SHA__)
|
||||
uint32_t hash0[16] __attribute__ ((aligned (64)));
|
||||
|
@@ -3,22 +3,121 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(TRIBUS_4WAY)
|
||||
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
|
||||
static __thread jh512_4way_context ctx_mid;
|
||||
/*
|
||||
void init_tribus_4way_ctx()
|
||||
#if defined(TRIBUS_8WAY)
|
||||
|
||||
static __thread jh512_8way_context ctx_mid;
|
||||
|
||||
void tribus_hash_8way( void *state, const void *input )
|
||||
{
|
||||
init_echo( &tribus_4way_ctx, 512 );
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
jh512_8way_context ctx_jh;
|
||||
keccak512_8way_context ctx_keccak;
|
||||
hashState_echo ctx_echo;
|
||||
|
||||
memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
|
||||
jh512_8way_update( &ctx_jh, input + (64<<3), 16 );
|
||||
jh512_8way_close( &ctx_jh, vhash );
|
||||
|
||||
keccak512_8way_init( &ctx_keccak );
|
||||
keccak512_8way_update( &ctx_keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx_keccak, vhash );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash, 512 );
|
||||
|
||||
// hash echo serially
|
||||
init_echo( &ctx_echo, 512 );
|
||||
update_final_echo( &ctx_echo, (BitSequence *) hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx_echo, 512 );
|
||||
update_final_echo( &ctx_echo, (BitSequence *) hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx_echo, 512 );
|
||||
update_final_echo( &ctx_echo, (BitSequence *) hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx_echo, 512 );
|
||||
update_final_echo( &ctx_echo, (BitSequence *) hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
init_echo( &ctx_echo, 512 );
|
||||
update_final_echo( &ctx_echo, (BitSequence *) hash4,
|
||||
(const BitSequence *) hash4, 512 );
|
||||
init_echo( &ctx_echo, 512 );
|
||||
update_final_echo( &ctx_echo, (BitSequence *) hash5,
|
||||
(const BitSequence *) hash5, 512 );
|
||||
init_echo( &ctx_echo, 512 );
|
||||
update_final_echo( &ctx_echo, (BitSequence *) hash6,
|
||||
(const BitSequence *) hash6, 512 );
|
||||
init_echo( &ctx_echo, 512 );
|
||||
update_final_echo( &ctx_echo, (BitSequence *) hash7,
|
||||
(const BitSequence *) hash7, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+32, hash1, 32 );
|
||||
memcpy( state+64, hash2, 32 );
|
||||
memcpy( state+96, hash3, 32 );
|
||||
memcpy( state+128, hash4, 32 );
|
||||
memcpy( state+160, hash5, 32 );
|
||||
memcpy( state+192, hash6, 32 );
|
||||
memcpy( state+224, hash7, 32 );
|
||||
}
|
||||
*/
|
||||
void tribus_hash_4way(void *state, const void *input)
|
||||
|
||||
int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t n = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
jh512_8way_init( &ctx_mid );
|
||||
jh512_8way_update( &ctx_mid, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
tribus_hash_8way( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] < Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(TRIBUS_4WAY)
|
||||
|
||||
static __thread jh512_4way_context ctx_mid;
|
||||
|
||||
void tribus_hash_4way( void *state, const void *input )
|
||||
{
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
@@ -30,11 +129,11 @@ void tribus_hash_4way(void *state, const void *input)
|
||||
hashState_echo ctx_echo;
|
||||
|
||||
memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
|
||||
jh512_4way( &ctx_jh, input + (64<<2), 16 );
|
||||
jh512_4way_update( &ctx_jh, input + (64<<2), 16 );
|
||||
jh512_4way_close( &ctx_jh, vhash );
|
||||
|
||||
keccak512_4way_init( &ctx_keccak );
|
||||
keccak512_4way( &ctx_keccak, vhash, 64 );
|
||||
keccak512_4way_update( &ctx_keccak, vhash, 64 );
|
||||
keccak512_4way_close( &ctx_keccak, vhash );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
@@ -60,7 +159,7 @@ void tribus_hash_4way(void *state, const void *input)
|
||||
}
|
||||
|
||||
int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
@@ -70,57 +169,32 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t n = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
uint64_t htmax[] = { 0,
|
||||
0xF,
|
||||
0xFF,
|
||||
0xFFF,
|
||||
0xFFFF,
|
||||
0x10000000 };
|
||||
|
||||
uint32_t masks[] = { 0xFFFFFFFF,
|
||||
0xFFFFFFF0,
|
||||
0xFFFFFF00,
|
||||
0xFFFFF000,
|
||||
0xFFFF0000,
|
||||
0 };
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
// precalc midstate
|
||||
// doing it one way then then interleaving would be faster but too
|
||||
// complicated tto interleave context.
|
||||
jh512_4way_init( &ctx_mid );
|
||||
jh512_4way( &ctx_mid, vdata, 64 );
|
||||
jh512_4way_update( &ctx_mid, vdata, 64 );
|
||||
|
||||
for ( int m = 0; m < 6; m++ )
|
||||
{
|
||||
if ( Htarg <= htmax[m] )
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
tribus_hash_4way( hash, vdata );
|
||||
tribus_hash_4way( hash, vdata );
|
||||
|
||||
pdata[19] = n;
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( ( !( (hash+(i<<3))[7] & mask ) )
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart);
|
||||
break;
|
||||
}
|
||||
}
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
if ( (hash+(i<<3))[7] < Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -2,9 +2,11 @@
|
||||
|
||||
bool register_tribus_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
#if defined (TRIBUS_4WAY)
|
||||
// init_tribus_4way_ctx();
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined (TRIBUS_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_tribus_8way;
|
||||
gate->hash = (void*)&tribus_hash_8way;
|
||||
#elif defined (TRIBUS_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_tribus_4way;
|
||||
gate->hash = (void*)&tribus_hash_4way;
|
||||
#else
|
||||
|
@@ -1,16 +1,23 @@
|
||||
#ifndef TRIBUS_GATE_H__
|
||||
#define TRIBUS_GATE_H__
|
||||
#define TRIBUS_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define TRIBUS_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define TRIBUS_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define TRIBUS_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(TRIBUS_4WAY)
|
||||
#if defined(TRIBUS_8WAY)
|
||||
|
||||
//void init_tribus_4way_ctx();
|
||||
void tribus_hash_8way( void *state, const void *input );
|
||||
|
||||
int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(TRIBUS_4WAY)
|
||||
|
||||
void tribus_hash_4way( void *state, const void *input );
|
||||
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# This script is not intended for users, it is only used for compile testing
|
||||
# during develpment. Howver the information contained my provide cimpilation
|
||||
# during develpment. However the information contained may provide compilation
|
||||
# tips to users.
|
||||
|
||||
make distclean || echo clean
|
||||
|
@@ -1,86 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# This script is not intended for users, it is only used for compile testing
|
||||
# during develpment. Howver the information contained my provide cimpilation
|
||||
# tips to users.
|
||||
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-avx512.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx512
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-avx2.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx2
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-aes-avx.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-aes-avx
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-aes-sse42.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-aes-sse42
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7 -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-sse42.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-sse42
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=core2 -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-ssse3.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-ssse3
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-sse2.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-sse2
|
||||
|
||||
make clean || echo done
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-zen.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-zen
|
||||
|
||||
make clean || echo done
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
strip -s cpuminer
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.0.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.1.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.10.0'
|
||||
PACKAGE_STRING='cpuminer-opt 3.10.0'
|
||||
PACKAGE_VERSION='3.10.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.10.1'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.10.0 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.10.1 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.10.0:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.10.1:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.10.0
|
||||
cpuminer-opt configure 3.10.1
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.10.0, which was
|
||||
It was created by cpuminer-opt $as_me 3.10.1, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.10.0'
|
||||
VERSION='3.10.1'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.10.0, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.10.1, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.10.0
|
||||
cpuminer-opt config.status 3.10.1
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.10.0])
|
||||
AC_INIT([cpuminer-opt], [3.10.1])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
65
cpu-miner.c
65
cpu-miner.c
@@ -3327,7 +3327,7 @@ static void show_credits()
|
||||
{
|
||||
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
|
||||
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
|
||||
printf(" with AES_NI and AVX2 and SHA extensions.\n");
|
||||
printf(" with AES_NI and AVX2, AVX512 and SHA extensions.\n");
|
||||
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
|
||||
}
|
||||
|
||||
@@ -3343,6 +3343,7 @@ bool check_cpu_capability ()
|
||||
bool cpu_has_avx512 = has_avx512();
|
||||
bool cpu_has_vaes = has_vaes();
|
||||
bool sw_has_aes = false;
|
||||
bool sw_has_sse2 = false;
|
||||
bool sw_has_sse42 = false;
|
||||
bool sw_has_avx = false;
|
||||
bool sw_has_avx2 = false;
|
||||
@@ -3369,6 +3370,9 @@ bool check_cpu_capability ()
|
||||
#ifdef __AES__
|
||||
sw_has_aes = true;
|
||||
#endif
|
||||
#ifdef __SSE2__
|
||||
sw_has_sse2 = true;
|
||||
#endif
|
||||
#ifdef __SSE4_2__
|
||||
sw_has_sse42 = true;
|
||||
#endif
|
||||
@@ -3407,36 +3411,36 @@ bool check_cpu_capability ()
|
||||
#endif
|
||||
|
||||
printf("CPU features:");
|
||||
if ( cpu_has_sse2 ) printf( " SSE2" );
|
||||
if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( cpu_has_avx ) printf( " AVX" );
|
||||
if ( cpu_has_avx2 ) printf( " AVX2" );
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
else if ( cpu_has_avx2 ) printf( " AVX2" );
|
||||
else if ( cpu_has_avx ) printf( " AVX" );
|
||||
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( cpu_has_sse2 ) printf( " SSE2" );
|
||||
|
||||
printf(".\nSW features: SSE2");
|
||||
if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( sw_has_avx ) printf( " AVX" );
|
||||
if ( sw_has_avx2 ) printf( " AVX2" );
|
||||
if ( sw_has_avx512 ) printf( " AVX512" );
|
||||
if ( sw_has_sha ) printf( " SHA" );
|
||||
if ( sw_has_vaes ) printf( " VAES" );
|
||||
|
||||
printf(".\nSW features:");
|
||||
if ( sw_has_vaes ) printf( " VAES" );
|
||||
else if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sha ) printf( " SHA" );
|
||||
if ( sw_has_avx512 ) printf( " AVX512" );
|
||||
else if ( sw_has_avx2 ) printf( " AVX2" );
|
||||
else if ( sw_has_avx ) printf( " AVX" );
|
||||
else if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( sw_has_sse2 ) printf( " SSE2" );
|
||||
|
||||
printf(".\nAlgo features:");
|
||||
if ( algo_features == EMPTY_SET ) printf( " None" );
|
||||
else
|
||||
{
|
||||
if ( algo_has_sse2 ) printf( " SSE2" );
|
||||
if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||
if ( algo_has_avx2 ) printf( " AVX2" );
|
||||
if ( algo_has_avx512 ) printf( " AVX512" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
if ( algo_has_vaes ) printf( " VAES" );
|
||||
if ( algo_has_vaes ) printf( " VAES" );
|
||||
else if ( algo_has_aes ) printf( " AES" );
|
||||
if ( algo_has_sha ) printf( " SHA" );
|
||||
if ( algo_has_avx512 ) printf( " AVX512" );
|
||||
else if ( algo_has_avx2 ) printf( " AVX2" );
|
||||
else if ( algo_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( algo_has_sse2 ) printf( " SSE2" );
|
||||
}
|
||||
printf(".\n");
|
||||
|
||||
@@ -3483,12 +3487,13 @@ bool check_cpu_capability ()
|
||||
if ( use_none ) printf( " no optimizations" );
|
||||
else
|
||||
{
|
||||
if ( use_aes ) printf( " AES" );
|
||||
if ( use_vaes ) printf( " VAES" );
|
||||
else if ( use_aes ) printf( " AES" );
|
||||
if ( use_avx512 ) printf( " AVX512" );
|
||||
else if ( use_avx2 ) printf( " AVX2" );
|
||||
else if ( use_sse42 ) printf( " SSE4.2" );
|
||||
else if ( use_sse2 ) printf( " SSE2" );
|
||||
if ( use_sha ) printf( " SHA" );
|
||||
else if ( use_avx2 ) printf( " AVX2" );
|
||||
else if ( use_sse42 ) printf( " SSE4.2" );
|
||||
else if ( use_sse2 ) printf( " SSE2" );
|
||||
if ( use_sha ) printf( " SHA" );
|
||||
}
|
||||
printf( ".\n\n" );
|
||||
|
||||
|
@@ -1991,7 +1991,7 @@ static inline void rintrlv_4x64_4x32( void *dst, const void *src,
|
||||
d[ 0] = s[ 0]; d[ 1] = s[ 2]; d[ 2] = s[ 4]; d[ 3] = s[ 6]; \
|
||||
d[ 4] = s[ 8]; d[ 5] = s[10]; d[ 6] = s[12]; d[ 7] = s[14]; \
|
||||
d[ 8] = s[ 1]; d[ 9] = s[ 3]; d[10] = s[ 5]; d[11] = s[ 7]; \
|
||||
d[12] = s[ 9]; d[13] = s[11]; d[14] = s[13]; d[16] = s[15]; \
|
||||
d[12] = s[ 9]; d[13] = s[11]; d[14] = s[13]; d[15] = s[15]; \
|
||||
} while(0)
|
||||
|
||||
|
||||
@@ -2002,47 +2002,18 @@ static inline void rintrlv_8x64_8x32( void *dst, const void *src,
|
||||
{
|
||||
RLEAVE_8x64_8x32( 0 ); RLEAVE_8x64_8x32( 16 );
|
||||
RLEAVE_8x64_8x32( 32 ); RLEAVE_8x64_8x32( 48 );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
RLEAVE_8x64_8x32( 64 ); RLEAVE_8x64_8x32( 80 );
|
||||
RLEAVE_8x64_8x32( 96 ); RLEAVE_8x64_8x32( 112 );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
RLEAVE_8x64_8x32( 128 ); RLEAVE_8x64_8x32( 144 );
|
||||
RLEAVE_8x64_8x32( 160 ); RLEAVE_8x64_8x32( 176 );
|
||||
RLEAVE_8x64_8x32( 192 ); RLEAVE_8x64_8x32( 208 );
|
||||
RLEAVE_8x64_8x32( 224 ); RLEAVE_8x64_8x32( 240 );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
RLEAVE_8x64_8x32( 256 ); RLEAVE_8x64_8x32( 272 );
|
||||
RLEAVE_8x64_8x32( 288 ); RLEAVE_8x64_8x32( 304 );
|
||||
RLEAVE_8x64_8x32( 320 ); RLEAVE_8x64_8x32( 336 );
|
||||
RLEAVE_8x64_8x32( 352 ); RLEAVE_8x64_8x32( 368 );
|
||||
|
||||
RLEAVE_8x64_8x32( 384 ); RLEAVE_8x64_8x32( 400 );
|
||||
RLEAVE_8x64_8x32( 416 ); RLEAVE_8x64_8x32( 432 );
|
||||
RLEAVE_8x64_8x32( 448 ); RLEAVE_8x64_8x32( 464 );
|
||||
RLEAVE_8x64_8x32( 480 ); RLEAVE_8x64_8x32( 496 );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
RLEAVE_8x64_8x32( 512 ); RLEAVE_8x64_8x32( 528 );
|
||||
RLEAVE_8x64_8x32( 544 ); RLEAVE_8x64_8x32( 560 );
|
||||
RLEAVE_8x64_8x32( 576 ); RLEAVE_8x64_8x32( 592 );
|
||||
RLEAVE_8x64_8x32( 608 ); RLEAVE_8x64_8x32( 624 );
|
||||
|
||||
RLEAVE_8x64_8x32( 640 ); RLEAVE_8x64_8x32( 656 );
|
||||
RLEAVE_8x64_8x32( 672 ); RLEAVE_8x64_8x32( 688 );
|
||||
RLEAVE_8x64_8x32( 704 ); RLEAVE_8x64_8x32( 720 );
|
||||
RLEAVE_8x64_8x32( 736 ); RLEAVE_8x64_8x32( 752 );
|
||||
|
||||
RLEAVE_8x64_8x32( 768 ); RLEAVE_8x64_8x32( 784 );
|
||||
RLEAVE_8x64_8x32( 800 ); RLEAVE_8x64_8x32( 816 );
|
||||
RLEAVE_8x64_8x32( 832 ); RLEAVE_8x64_8x32( 848 );
|
||||
RLEAVE_8x64_8x32( 864 ); RLEAVE_8x64_8x32( 880 );
|
||||
|
||||
RLEAVE_8x64_8x32( 896 ); RLEAVE_8x64_8x32( 912 );
|
||||
RLEAVE_8x64_8x32( 928 ); RLEAVE_8x64_8x32( 944 );
|
||||
RLEAVE_8x64_8x32( 960 ); RLEAVE_8x64_8x32( 976 );
|
||||
RLEAVE_8x64_8x32( 992 ); RLEAVE_8x64_8x32(1008 );
|
||||
}
|
||||
|
||||
#undef RLEAVE_8x64_8x32
|
||||
@@ -2308,5 +2279,17 @@ do { \
|
||||
#define mm512_intrlv_blend_32( hi, lo ) \
|
||||
_mm512_mask_blend_epi32( 0x5555, hi, lo )
|
||||
|
||||
#define mm512_blend_hash_8x64( dst, a, b, mask ) \
|
||||
do { \
|
||||
dst[0] = _mm512_mask_blend_epi64( mask, a[0], b[0] ); \
|
||||
dst[1] = _mm512_mask_blend_epi64( mask, a[1], b[1] ); \
|
||||
dst[2] = _mm512_mask_blend_epi64( mask, a[2], b[2] ); \
|
||||
dst[3] = _mm512_mask_blend_epi64( mask, a[3], b[3] ); \
|
||||
dst[4] = _mm512_mask_blend_epi64( mask, a[4], b[4] ); \
|
||||
dst[5] = _mm512_mask_blend_epi64( mask, a[5], b[5] ); \
|
||||
dst[6] = _mm512_mask_blend_epi64( mask, a[6], b[6] ); \
|
||||
dst[7] = _mm512_mask_blend_epi64( mask, a[7], b[7] ); \
|
||||
} while(0)
|
||||
|
||||
#endif // AVX512
|
||||
#endif // INTERLEAVE_H__
|
||||
|
@@ -162,9 +162,10 @@ static inline __m128i mm128_neg1_fn()
|
||||
|
||||
#define mm128_allbits0( a ) _mm_testz_si128( a, a )
|
||||
#define mm128_allbits1( a ) _mm_testc_si128( a, m128_neg1 )
|
||||
#define mm128_allbitsne( a ) _mm_testnzc_si128( a, m128_neg1 )
|
||||
#define mm128_anybits0 mm128_allbitsne
|
||||
#define mm128_anybits1 mm128_allbitsne
|
||||
// probably broken, avx2 is
|
||||
//#define mm128_allbitsne( a ) _mm_testnzc_si128( a, m128_neg1 )
|
||||
#define mm128_anybits0( a ) mm128_allbits1( a )
|
||||
#define mm128_anybits1( a ) mm128_allbits0( a )
|
||||
|
||||
#else // SSE2
|
||||
|
||||
|
@@ -123,9 +123,10 @@ do { \
|
||||
|
||||
#define mm256_allbits0( a ) _mm256_testz_si256( a, a )
|
||||
#define mm256_allbits1( a ) _mm256_testc_si256( a, m256_neg1 )
|
||||
#define mm256_allbitsne( a ) _mm256_testnzc_si256( a, m256_neg1 )
|
||||
#define mm256_anybits0 mm256_allbitsne
|
||||
#define mm256_anybits1 mm256_allbitsne
|
||||
//broken
|
||||
//#define mm256_allbitsne( a ) _mm256_testnzc_si256( a, m256_neg1 )
|
||||
#define mm256_anybits0( a ) !mm256_allbits1( a )
|
||||
#define mm256_anybits1( a ) !mm256_allbits0( a )
|
||||
|
||||
|
||||
// Parallel AES, for when x is expected to be in a 256 bit register.
|
||||
|
@@ -110,10 +110,12 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
m128_const_64( i1, i0 ) ), 0x44 )
|
||||
|
||||
// Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
|
||||
#define m512_const1_256( i ) _mm512_broadcast_i64x4( i )
|
||||
#define m512_const1_128( i ) _mm512_broadcast_i64x2( i )
|
||||
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
|
||||
#define m512_const1_8 ( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
|
||||
|
||||
//
|
||||
@@ -277,8 +279,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
_mm512_shuffle_epi8( v, \
|
||||
m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203, \
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213 ) )
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
|
||||
|
||||
#define mm512_bswap_16( v ) \
|
||||
_mm512_shuffle_epi8( v, \
|
||||
@@ -415,38 +417,49 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
|
||||
|
||||
/* Need to fix
|
||||
// Rotate 256 bit lanes by one 32 bit element
|
||||
#define mm512_ror1x32_256( v ) \
|
||||
_mm512_permutexvar_epi32( m512_const4_64( \
|
||||
_mm512_permutexvar_epi32( m512_const_64( \
|
||||
0x000000080000000f, 0x0000000e0000000d, \
|
||||
0x0000000c0000000b, 0x0000000a00000009, \
|
||||
0x0000000000000007, 0x0000000600000005, \
|
||||
0x0000000400000003, 0x0000000200000001 ), v )
|
||||
|
||||
#define mm512_rol1x32_256( v ) \
|
||||
_mm512_permutexvar_epi32( m512_const4_64( \
|
||||
_mm512_permutexvar_epi32( m512_const_64( \
|
||||
0x0000000e0000000d, 0x0000000c0000000b, \
|
||||
0x0000000a00000009, 0x000000080000000f, \
|
||||
0x0000000600000005, 0x0000000400000003, \
|
||||
0x0000000200000001, 0x0000000000000007 ), v )
|
||||
|
||||
#define mm512_ror1x16_256( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const4_64( \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x00100001001e001d, 0x001c001b001a0019, \
|
||||
0x0018001700160015, 0x0014001300120011, \
|
||||
0x0000000f000e000d, 0x000c000b000a0009, \
|
||||
0x0008000700060005, 0x0004000300020001 ), v )
|
||||
|
||||
#define mm512_rol1x16_256( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const4_64( \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x001e001d001c001b, 0x001a001900180017, \
|
||||
0x0016001500140013, 0x001200110010001f, \
|
||||
0x000e000d000c000b, 0x000a000900080007, \
|
||||
0x0006000500040003, 0x000200010000000f ), v )
|
||||
|
||||
#define mm512_ror1x8_256( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const4_64( \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
0x203f3e3d3c3b3a39, 0x3837363534333231, \
|
||||
0x302f2e2d2c2b2a29, 0x2827262524232221, \
|
||||
0x001f1e1d1c1b1a19, 0x1817161514131211, \
|
||||
0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
|
||||
|
||||
#define mm512_rol1x8_256( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const4_64( \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
0x3e3d3c3b3a393837, 0x363534333231302f, \
|
||||
0x2e2d2c2b2a292827, 0x262524232221203f, \
|
||||
0x1e1d1c1b1a191817, 0x161514131211100f, \
|
||||
0x0e0d0c0b0a090807, 0x060504030201001f ), v )
|
||||
*/
|
||||
|
||||
//
|
||||
// Rotate elements within 128 bit lanes of 512 bit vector.
|
||||
|
||||
@@ -457,23 +470,33 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 )
|
||||
#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 )
|
||||
|
||||
/*
|
||||
#define mm512_ror1x16_128( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const2_64( \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x0018001f001e001d, 0x001c001b001a0019, \
|
||||
0x0010001700160015, 0x0014001300120011, \
|
||||
0x0008000f000e000d, 0x000c000b000a0009, \
|
||||
0x0000000700060005, 0x0004000300020001 ), v )
|
||||
|
||||
#define mm512_rol1x16_128( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const2_64( \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x001e001d001c001b, 0x001a00190018001f, \
|
||||
0x0016001500140013, 0x0012001100100017, \
|
||||
0x000e000d000c000b, 0x000a00090008000f, \
|
||||
0x0006000500040003, 0x0002000100000007 ), v )
|
||||
|
||||
#define mm512_ror1x8_128( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const2_64( \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
0x303f3e3d3c3b3a39, 0x3837363534333231, \
|
||||
0x202f2e2d2c2b2a29, 0x2827262524232221, \
|
||||
0x101f1e1d1c1b1a19, 0x1817161514131211, \
|
||||
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
|
||||
|
||||
#define mm512_rol1x8_128( v ) \
|
||||
_mm512_shuffle_epi8( v, m512_const2_64( \
|
||||
_mm512_shuffle_epi8( v, m512_const_64( \
|
||||
0x3e3d3c3b3a393837, 0x363534333231303f, \
|
||||
0x2e2d2c2b2a292827, 0x262524232221202f, \
|
||||
0x1e1d1c1b1a191817, 0x161514131211101f, \
|
||||
0x0e0d0c0b0a090807, 0x060504030201000f ) )
|
||||
*/
|
||||
|
||||
// Rotate 128 bit lanes by c bytes.
|
||||
#define mm512_bror_128( v, c ) \
|
||||
|
@@ -11,10 +11,15 @@
|
||||
|
||||
export LOCAL_LIB="$HOME/usr/lib"
|
||||
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
|
||||
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
|
||||
|
||||
export MINGW_LIB="/usr/x86_64-w64-mingw32/lib"
|
||||
|
||||
export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/7.3-win32"
|
||||
|
||||
# used by GCC
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
|
||||
# make link to local gmp header file.
|
||||
ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
|
||||
|
||||
@@ -26,13 +31,19 @@ mkdir release
|
||||
cp README.txt release/
|
||||
cp README.md release/
|
||||
cp RELEASE_NOTES release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
|
||||
cp $MINGW_LIB/zlib1.dll release/
|
||||
cp $MINGW_LIB/libwinpthread-1.dll release/
|
||||
cp $GCC_MINGW_LIB/libstdc++-6.dll release/
|
||||
cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
|
||||
#cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
|
||||
#cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
|
||||
#cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
|
||||
#cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
|
||||
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
|
||||
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
|
||||
# Start building...
|
||||
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
|
@@ -1,103 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Script for building Windows binaries release package using mingw.
|
||||
# Requires a custom mingw environment, not intended for users.
|
||||
#
|
||||
# Compiles Windows EXE files for selected CPU architectures, copies them
|
||||
# as well as some DLLs that aren't available in most Windows environments
|
||||
# into a release folder ready to be zipped and uploaded.
|
||||
|
||||
# define some local variables
|
||||
|
||||
export LOCAL_LIB="$HOME/usr/lib"
|
||||
|
||||
export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl"
|
||||
|
||||
export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/openssl --host=x86_64-w64-mingw32"
|
||||
|
||||
# make link to local gmp header file.
|
||||
ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
|
||||
|
||||
# edit configure to fix pthread lib name for Windows.
|
||||
#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
|
||||
|
||||
# make release directory and copy selected DLLs.
|
||||
mkdir release
|
||||
cp README.txt release/
|
||||
cp README.md release/
|
||||
cp RELEASE_NOTES release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
|
||||
cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
|
||||
cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
|
||||
cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
|
||||
cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-zen.exe
|
||||
|
||||
#make clean || echo clean
|
||||
#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $CONFIGURE_ARGS
|
||||
#make
|
||||
#strip -s cpuminer.exe
|
||||
#mv cpuminer.exe release/cpuminer-avx-sha.exe
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx2.exe
|
||||
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
|
||||
#make -j
|
||||
#strip -s cpuminer.exe
|
||||
#mv cpuminer.exe release/cpuminer-aes-sha.exe
|
||||
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx.exe
|
||||
|
||||
# -march=westmere is supported in gcc5
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=westmere -Wall" ./configure $CONFIGURE_ARGS
|
||||
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-aes-sse42.exe
|
||||
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS
|
||||
#make
|
||||
#strip -s cpuminer.exe
|
||||
#mv cpuminer.exe release/cpuminer-sse42.exe
|
||||
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS
|
||||
#make
|
||||
#strip -s cpuminer.exe
|
||||
#mv cpuminer.exe release/cpuminer-ssse3.exe
|
||||
#make clean || echo clean
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-sse2.exe
|
||||
make clean || echo clean
|
||||
|
Reference in New Issue
Block a user