mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
a17ff6f189 |
12
README.txt
12
README.txt
@@ -23,12 +23,20 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
|||||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||||
|
|
||||||
|
More information for Intel and AMD CPU architectures and their features
|
||||||
|
can be found on Wikipedia.
|
||||||
|
|
||||||
|
https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
|
||||||
|
|
||||||
|
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
|
||||||
|
|
||||||
|
|
||||||
Exe name Compile flags Arch name
|
Exe name Compile flags Arch name
|
||||||
|
|
||||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||||
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
|
cpuminer-avx.exe "-march=corei7-avx" Sandybridge
|
||||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell, Sky-Kaby-Coffeelake
|
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell, Skylake, Coffeelake
|
||||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
||||||
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
|
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
|
||||||
|
|
||||||
|
@@ -25,19 +25,23 @@ Requirements
|
|||||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||||
supported.
|
supported.
|
||||||
|
|
||||||
64 bit Linux or Windows operating system. Apple and Android are not supported.
|
64 bit Linux or Windows operating system. Apple, Android and Rpi are
|
||||||
FreeBSD YMMV.
|
not supported. FreeBSD YMMV.
|
||||||
|
|
||||||
Change Log
|
Change Log
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
v3.10.2
|
||||||
|
|
||||||
|
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
|
||||||
|
Fixed c11 AVX2 invalid shares.
|
||||||
|
|
||||||
v3.10.1
|
v3.10.1
|
||||||
|
|
||||||
AVX512 for blake2b, nist5, quark, tribus.
|
AVX512 for blake2b, nist5, quark, tribus.
|
||||||
|
|
||||||
More broken lane fixes.
|
More broken lane fixes, fixed buffer overflow in skein AVX512, fixed
|
||||||
|
quark invalid shares AVX2.
|
||||||
Fixed buffer overflow in skein AVX512.
|
|
||||||
|
|
||||||
Only the highest ranking feature in a class is listed at startup, lower ranking
|
Only the highest ranking feature in a class is listed at startup, lower ranking
|
||||||
features are available but no longer listed.
|
features are available but no longer listed.
|
||||||
|
@@ -127,7 +127,7 @@ typedef struct {
|
|||||||
|
|
||||||
typedef blake_4way_big_context blake512_4way_context;
|
typedef blake_4way_big_context blake512_4way_context;
|
||||||
|
|
||||||
void blake512_4way_init( void *cc );
|
void blake512_4way_init( blake_4way_big_context *sc );
|
||||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||||
#define blake512_4way blake512_4way_update
|
#define blake512_4way blake512_4way_update
|
||||||
void blake512_4way_close( void *cc, void *dst );
|
void blake512_4way_close( void *cc, void *dst );
|
||||||
@@ -136,6 +136,37 @@ void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
|||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
//Blake-256 16 way
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
__m512i buf[16];
|
||||||
|
__m512i H[8];
|
||||||
|
size_t ptr;
|
||||||
|
uint32_t T0, T1;
|
||||||
|
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||||
|
} blake_16way_small_context __attribute__ ((aligned (128)));
|
||||||
|
|
||||||
|
// Default 14 rounds
|
||||||
|
typedef blake_16way_small_context blake256_16way_context;
|
||||||
|
void blake256_16way_init(void *cc);
|
||||||
|
void blake256_16way_update(void *cc, const void *data, size_t len);
|
||||||
|
void blake256_16way_close(void *cc, void *dst);
|
||||||
|
|
||||||
|
// 14 rounds, blake, decred
|
||||||
|
typedef blake_16way_small_context blake256r14_16way_context;
|
||||||
|
void blake256r14_16way_init(void *cc);
|
||||||
|
void blake256r14_16way_update(void *cc, const void *data, size_t len);
|
||||||
|
void blake256r14_16way_close(void *cc, void *dst);
|
||||||
|
|
||||||
|
// 8 rounds, blakecoin, vanilla
|
||||||
|
typedef blake_16way_small_context blake256r8_16way_context;
|
||||||
|
void blake256r8_16way_init(void *cc);
|
||||||
|
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||||
|
void blake256r8_16way_close(void *cc, void *dst);
|
||||||
|
|
||||||
|
|
||||||
|
// Blake-512 8 way
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
__m512i buf[16];
|
__m512i buf[16];
|
||||||
__m512i H[8];
|
__m512i H[8];
|
||||||
@@ -146,7 +177,7 @@ typedef struct {
|
|||||||
|
|
||||||
typedef blake_8way_big_context blake512_8way_context;
|
typedef blake_8way_big_context blake512_8way_context;
|
||||||
|
|
||||||
void blake512_8way_init( void *cc );
|
void blake512_8way_init( blake_8way_big_context *sc );
|
||||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||||
void blake512_8way_close( void *cc, void *dst );
|
void blake512_8way_close( void *cc, void *dst );
|
||||||
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||||
|
@@ -680,6 +680,144 @@ do { \
|
|||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
// Blake-256 16 way AVX512
|
||||||
|
|
||||||
|
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||||
|
do { \
|
||||||
|
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
|
||||||
|
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
|
||||||
|
c = _mm512_add_epi32( c, d ); \
|
||||||
|
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
|
||||||
|
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
|
||||||
|
_mm512_xor_si512( _mm512_set1_epi32( c0 ), m1 ) ); \
|
||||||
|
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
|
||||||
|
c = _mm512_add_epi32( c, d ); \
|
||||||
|
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define ROUND_S_16WAY(r) do { \
|
||||||
|
GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||||
|
GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||||
|
GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||||
|
GS_16WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||||
|
GS_16WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||||
|
GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||||
|
GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||||
|
GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define DECL_STATE32_16WAY \
|
||||||
|
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||||
|
sph_u32 T0, T1;
|
||||||
|
|
||||||
|
#define READ_STATE32_16WAY(state) \
|
||||||
|
do { \
|
||||||
|
H0 = (state)->H[0]; \
|
||||||
|
H1 = (state)->H[1]; \
|
||||||
|
H2 = (state)->H[2]; \
|
||||||
|
H3 = (state)->H[3]; \
|
||||||
|
H4 = (state)->H[4]; \
|
||||||
|
H5 = (state)->H[5]; \
|
||||||
|
H6 = (state)->H[6]; \
|
||||||
|
H7 = (state)->H[7]; \
|
||||||
|
T0 = (state)->T0; \
|
||||||
|
T1 = (state)->T1; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define WRITE_STATE32_16WAY(state) \
|
||||||
|
do { \
|
||||||
|
(state)->H[0] = H0; \
|
||||||
|
(state)->H[1] = H1; \
|
||||||
|
(state)->H[2] = H2; \
|
||||||
|
(state)->H[3] = H3; \
|
||||||
|
(state)->H[4] = H4; \
|
||||||
|
(state)->H[5] = H5; \
|
||||||
|
(state)->H[6] = H6; \
|
||||||
|
(state)->H[7] = H7; \
|
||||||
|
(state)->T0 = T0; \
|
||||||
|
(state)->T1 = T1; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define COMPRESS32_16WAY( rounds ) \
|
||||||
|
do { \
|
||||||
|
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||||
|
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||||
|
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||||
|
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||||
|
__m512i shuf_bswap32; \
|
||||||
|
V0 = H0; \
|
||||||
|
V1 = H1; \
|
||||||
|
V2 = H2; \
|
||||||
|
V3 = H3; \
|
||||||
|
V4 = H4; \
|
||||||
|
V5 = H5; \
|
||||||
|
V6 = H6; \
|
||||||
|
V7 = H7; \
|
||||||
|
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
|
||||||
|
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
|
||||||
|
VA = m512_const1_64( 0x13198A2E13198A2E ); \
|
||||||
|
VB = m512_const1_64( 0x0370734403707344 ); \
|
||||||
|
VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
|
||||||
|
m512_const1_64( 0xA4093822A4093822 ) ); \
|
||||||
|
VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
|
||||||
|
m512_const1_64( 0x299F31D0299F31D0 ) ); \
|
||||||
|
VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
|
||||||
|
m512_const1_64( 0x082EFA98082EFA98 ) ); \
|
||||||
|
VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
|
||||||
|
m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
||||||
|
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
||||||
|
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
||||||
|
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||||
|
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||||
|
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
|
||||||
|
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
|
||||||
|
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
|
||||||
|
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
|
||||||
|
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
|
||||||
|
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
|
||||||
|
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
|
||||||
|
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
|
||||||
|
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
|
||||||
|
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
|
||||||
|
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
|
||||||
|
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
|
||||||
|
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
|
||||||
|
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
|
||||||
|
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
|
||||||
|
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
|
||||||
|
ROUND_S_16WAY(0); \
|
||||||
|
ROUND_S_16WAY(1); \
|
||||||
|
ROUND_S_16WAY(2); \
|
||||||
|
ROUND_S_16WAY(3); \
|
||||||
|
ROUND_S_16WAY(4); \
|
||||||
|
ROUND_S_16WAY(5); \
|
||||||
|
ROUND_S_16WAY(6); \
|
||||||
|
ROUND_S_16WAY(7); \
|
||||||
|
if (rounds == 14) \
|
||||||
|
{ \
|
||||||
|
ROUND_S_16WAY(8); \
|
||||||
|
ROUND_S_16WAY(9); \
|
||||||
|
ROUND_S_16WAY(0); \
|
||||||
|
ROUND_S_16WAY(1); \
|
||||||
|
ROUND_S_16WAY(2); \
|
||||||
|
ROUND_S_16WAY(3); \
|
||||||
|
} \
|
||||||
|
H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
|
||||||
|
H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
|
||||||
|
H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
|
||||||
|
H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
|
||||||
|
H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
|
||||||
|
H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
|
||||||
|
H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
|
||||||
|
H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Blake-256 4 way
|
// Blake-256 4 way
|
||||||
@@ -916,6 +1054,179 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
//Blake-256 16 way AVX512
|
||||||
|
|
||||||
|
static void
|
||||||
|
blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
|
||||||
|
const sph_u32 *salt, int rounds )
|
||||||
|
{
|
||||||
|
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );
|
||||||
|
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||||
|
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||||
|
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||||
|
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
|
||||||
|
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
|
||||||
|
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||||
|
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||||
|
sc->T0 = sc->T1 = 0;
|
||||||
|
sc->ptr = 0;
|
||||||
|
sc->rounds = rounds;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
|
||||||
|
{
|
||||||
|
__m512i *vdata = (__m512i*)data;
|
||||||
|
__m512i *buf;
|
||||||
|
size_t ptr;
|
||||||
|
const int buf_size = 64; // number of elements, sizeof/4
|
||||||
|
DECL_STATE32_16WAY
|
||||||
|
buf = sc->buf;
|
||||||
|
ptr = sc->ptr;
|
||||||
|
if ( len < buf_size - ptr )
|
||||||
|
{
|
||||||
|
memcpy_512( buf + (ptr>>2), vdata, len>>2 );
|
||||||
|
ptr += len;
|
||||||
|
sc->ptr = ptr;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
READ_STATE32_16WAY(sc);
|
||||||
|
while ( len > 0 )
|
||||||
|
{
|
||||||
|
size_t clen;
|
||||||
|
|
||||||
|
clen = buf_size - ptr;
|
||||||
|
if (clen > len)
|
||||||
|
clen = len;
|
||||||
|
memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
|
||||||
|
ptr += clen;
|
||||||
|
vdata += (clen>>2);
|
||||||
|
len -= clen;
|
||||||
|
if ( ptr == buf_size )
|
||||||
|
{
|
||||||
|
if ( ( T0 = T0 + 512 ) < 512 )
|
||||||
|
T1 = T1 + 1;
|
||||||
|
COMPRESS32_16WAY( sc->rounds );
|
||||||
|
ptr = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WRITE_STATE32_16WAY(sc);
|
||||||
|
sc->ptr = ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||||
|
void *dst, size_t out_size_w32 )
|
||||||
|
{
|
||||||
|
__m512i buf[16];
|
||||||
|
size_t ptr;
|
||||||
|
unsigned bit_len;
|
||||||
|
sph_u32 th, tl;
|
||||||
|
|
||||||
|
ptr = sc->ptr;
|
||||||
|
bit_len = ((unsigned)ptr << 3);
|
||||||
|
buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );
|
||||||
|
tl = sc->T0 + bit_len;
|
||||||
|
th = sc->T1;
|
||||||
|
|
||||||
|
if ( ptr == 0 )
|
||||||
|
{
|
||||||
|
sc->T0 = 0xFFFFFE00UL;
|
||||||
|
sc->T1 = 0xFFFFFFFFUL;
|
||||||
|
}
|
||||||
|
else if ( sc->T0 == 0 )
|
||||||
|
{
|
||||||
|
sc->T0 = 0xFFFFFE00UL + bit_len;
|
||||||
|
sc->T1 = sc->T1 - 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
sc->T0 -= 512 - bit_len;
|
||||||
|
|
||||||
|
if ( ptr <= 52 )
|
||||||
|
{
|
||||||
|
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||||
|
if ( out_size_w32 == 8 )
|
||||||
|
buf[52>>2] = _mm512_or_si512( buf[52>>2],
|
||||||
|
m512_const1_64( 0x0100000001000000ULL ) );
|
||||||
|
buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
|
||||||
|
buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
|
||||||
|
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||||
|
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
||||||
|
sc->T0 = 0xFFFFFE00UL;
|
||||||
|
sc->T1 = 0xFFFFFFFFUL;
|
||||||
|
memset_zero_512( buf, 56>>2 );
|
||||||
|
if ( out_size_w32 == 8 )
|
||||||
|
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
|
||||||
|
buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
|
||||||
|
buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
|
||||||
|
blake32_16way( sc, buf, 64 );
|
||||||
|
}
|
||||||
|
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256_16way_init(void *cc)
|
||||||
|
{
|
||||||
|
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256_16way_update(void *cc, const void *data, size_t len)
|
||||||
|
{
|
||||||
|
blake32_16way(cc, data, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Finish a 16-way Blake-256 computation and store the result in dst.
 *
 * cc  - hashing context previously set up by blake256_16way_init.
 * dst - output buffer, written by blake32_16way_close.
 *
 * Forwards to blake32_16way_close with no extra bits (ub = 0, n = 0) and
 * an output size of 8 32-bit words.  The name matches the prototype
 * blake256_16way_close(void *cc, void *dst) and the naming used by the
 * blake256r14_16way_close / blake256r8_16way_close variants.
 */
void
blake256_16way_close(void *cc, void *dst)
{
   blake32_16way_close(cc, 0, 0, dst, 8);
}

/*
 * Earlier spelling of blake256_16way_close, retained so that any caller
 * using the old name keeps linking.  Behaves identically.
 */
void
blake256_16way_close_update(void *cc, void *dst)
{
   blake256_16way_close(cc, dst);
}
|
||||||
|
|
||||||
|
void blake256r14_16way_init(void *cc)
|
||||||
|
{
|
||||||
|
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256r14_16way_update(void *cc, const void *data, size_t len)
|
||||||
|
{
|
||||||
|
blake32_16way(cc, data, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256r14_16way_close(void *cc, void *dst)
|
||||||
|
{
|
||||||
|
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
void blake256r8_16way_init(void *cc)
|
||||||
|
{
|
||||||
|
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256r8_16way_update(void *cc, const void *data, size_t len)
|
||||||
|
{
|
||||||
|
blake32_16way(cc, data, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
blake256r8_16way_close(void *cc, void *dst)
|
||||||
|
{
|
||||||
|
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // AVX512
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Blake-256 4 way
|
// Blake-256 4 way
|
||||||
|
|
||||||
// default 14 rounds, backward compatibility
|
// default 14 rounds, backward compatibility
|
||||||
|
@@ -42,20 +42,13 @@
|
|||||||
extern "C"{
|
extern "C"{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
|
|
||||||
#define SPH_SMALL_FOOTPRINT_BLAKE 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
|
|
||||||
#define SPH_COMPACT_BLAKE_64 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#pragma warning (disable: 4146)
|
#pragma warning (disable: 4146)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Blake-512
|
// Blake-512 common
|
||||||
|
|
||||||
|
/*
|
||||||
static const sph_u64 IV512[8] = {
|
static const sph_u64 IV512[8] = {
|
||||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||||
@@ -65,10 +58,6 @@ static const sph_u64 IV512[8] = {
|
|||||||
|
|
||||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||||
|
|
||||||
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
|
|
||||||
|
|
||||||
// Blake-256 4 & 8 way, Blake-512 4 way
|
|
||||||
|
|
||||||
static const unsigned sigma[16][16] = {
|
static const unsigned sigma[16][16] = {
|
||||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||||
@@ -88,7 +77,17 @@ static const unsigned sigma[16][16] = {
|
|||||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
static const sph_u64 CB[16] = {
|
||||||
|
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
||||||
|
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
||||||
|
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
||||||
|
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
||||||
|
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
||||||
|
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
||||||
|
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
||||||
|
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
#define Z00 0
|
#define Z00 0
|
||||||
#define Z01 1
|
#define Z01 1
|
||||||
@@ -285,23 +284,6 @@ static const unsigned sigma[16][16] = {
|
|||||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||||
#define CBF SPH_C64(0x636920D871574E69)
|
#define CBF SPH_C64(0x636920D871574E69)
|
||||||
|
|
||||||
/*
|
|
||||||
#if SPH_COMPACT_BLAKE_64
|
|
||||||
// not used
|
|
||||||
static const sph_u64 CB[16] = {
|
|
||||||
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
|
||||||
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
|
||||||
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
|
||||||
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
|
||||||
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
|
||||||
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
|
||||||
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
|
||||||
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define READ_STATE64(state) do { \
|
#define READ_STATE64(state) do { \
|
||||||
H0 = (state)->H[0]; \
|
H0 = (state)->H[0]; \
|
||||||
H1 = (state)->H[1]; \
|
H1 = (state)->H[1]; \
|
||||||
@@ -338,7 +320,7 @@ static const sph_u64 CB[16] = {
|
|||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
// Blake-512 8 way
|
// Blake-512 8 way AVX512
|
||||||
|
|
||||||
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
||||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||||
@@ -364,7 +346,6 @@ static const sph_u64 CB[16] = {
|
|||||||
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
#define DECL_STATE64_8WAY \
|
#define DECL_STATE64_8WAY \
|
||||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||||
__m512i S0, S1, S2, S3; \
|
__m512i S0, S1, S2, S3; \
|
||||||
@@ -443,9 +424,7 @@ static const sph_u64 CB[16] = {
|
|||||||
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
static void
|
void blake512_8way_init( blake_8way_big_context *sc )
|
||||||
blake64_8way_init( blake_8way_big_context *sc, const sph_u64 *iv,
|
|
||||||
const sph_u64 *salt )
|
|
||||||
{
|
{
|
||||||
__m512i zero = m512_zero;
|
__m512i zero = m512_zero;
|
||||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||||
@@ -511,20 +490,20 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake64_8way_close( blake_8way_big_context *sc,
|
blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||||
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
|
|
||||||
{
|
{
|
||||||
__m512i buf[16];
|
__m512i buf[16];
|
||||||
size_t ptr;
|
size_t ptr;
|
||||||
unsigned bit_len;
|
unsigned bit_len;
|
||||||
uint64_t z, zz;
|
// uint64_t z, zz;
|
||||||
sph_u64 th, tl;
|
sph_u64 th, tl;
|
||||||
|
|
||||||
ptr = sc->ptr;
|
ptr = sc->ptr;
|
||||||
bit_len = ((unsigned)ptr << 3);
|
bit_len = ((unsigned)ptr << 3);
|
||||||
z = 0x80 >> n;
|
// z = 0x80 >> n;
|
||||||
zz = ((ub & -z) | z) & 0xFF;
|
// zz = ((ub & -z) | z) & 0xFF;
|
||||||
buf[ptr>>3] = _mm512_set1_epi64( zz );
|
// buf[ptr>>3] = _mm512_set1_epi64( zz );
|
||||||
|
buf[ptr>>3] = m512_const1_64( 0x80 );
|
||||||
tl = sc->T0 + bit_len;
|
tl = sc->T0 + bit_len;
|
||||||
th = sc->T1;
|
th = sc->T1;
|
||||||
if (ptr == 0 )
|
if (ptr == 0 )
|
||||||
@@ -544,11 +523,10 @@ blake64_8way_close( blake_8way_big_context *sc,
|
|||||||
if ( ptr <= 104 )
|
if ( ptr <= 104 )
|
||||||
{
|
{
|
||||||
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||||
if ( out_size_w64 == 8 )
|
buf[104>>3] = _mm512_or_si512( buf[104>>3],
|
||||||
buf[(104>>3)] = _mm512_or_si512( buf[(104>>3)],
|
|
||||||
m512_const1_64( 0x0100000000000000ULL ) );
|
m512_const1_64( 0x0100000000000000ULL ) );
|
||||||
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
|
buf[112>>3] = m512_const1_64( bswap_64( th ) );
|
||||||
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
|
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
|
||||||
|
|
||||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||||
}
|
}
|
||||||
@@ -560,22 +538,15 @@ blake64_8way_close( blake_8way_big_context *sc,
|
|||||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||||
memset_zero_512( buf, 112>>3 );
|
memset_zero_512( buf, 112>>3 );
|
||||||
if ( out_size_w64 == 8 )
|
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
|
||||||
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
|
buf[112>>3] = m512_const1_64( bswap_64( th ) );
|
||||||
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
|
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
|
||||||
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
|
|
||||||
|
|
||||||
blake64_8way( sc, buf, 128 );
|
blake64_8way( sc, buf, 128 );
|
||||||
}
|
}
|
||||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
|
||||||
blake512_8way_init(void *cc)
|
|
||||||
{
|
|
||||||
blake64_8way_init(cc, IV512, salt_zero_big);
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
blake512_8way_update(void *cc, const void *data, size_t len)
|
blake512_8way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
@@ -591,7 +562,7 @@ blake512_8way_close(void *cc, void *dst)
|
|||||||
void
|
void
|
||||||
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||||
{
|
{
|
||||||
blake64_8way_close(cc, ub, n, dst, 8);
|
blake64_8way_close(cc, dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
@@ -698,11 +669,8 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
|||||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
//static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
|
||||||
|
|
||||||
static void
|
void blake512_4way_init( blake_4way_big_context *sc )
|
||||||
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
|
||||||
const sph_u64 *salt )
|
|
||||||
{
|
{
|
||||||
__m256i zero = m256_zero;
|
__m256i zero = m256_zero;
|
||||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||||
@@ -713,12 +681,10 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
|||||||
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
|
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||||
|
|
||||||
casti_m256i( sc->S, 0 ) = zero;
|
casti_m256i( sc->S, 0 ) = zero;
|
||||||
casti_m256i( sc->S, 1 ) = zero;
|
casti_m256i( sc->S, 1 ) = zero;
|
||||||
casti_m256i( sc->S, 2 ) = zero;
|
casti_m256i( sc->S, 2 ) = zero;
|
||||||
casti_m256i( sc->S, 3 ) = zero;
|
casti_m256i( sc->S, 3 ) = zero;
|
||||||
|
|
||||||
sc->T0 = sc->T1 = 0;
|
sc->T0 = sc->T1 = 0;
|
||||||
sc->ptr = 0;
|
sc->ptr = 0;
|
||||||
}
|
}
|
||||||
@@ -768,20 +734,16 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
blake64_4way_close( blake_4way_big_context *sc,
|
blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||||
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
|
|
||||||
{
|
{
|
||||||
__m256i buf[16];
|
__m256i buf[16];
|
||||||
size_t ptr;
|
size_t ptr;
|
||||||
unsigned bit_len;
|
unsigned bit_len;
|
||||||
uint64_t z, zz;
|
|
||||||
sph_u64 th, tl;
|
sph_u64 th, tl;
|
||||||
|
|
||||||
ptr = sc->ptr;
|
ptr = sc->ptr;
|
||||||
bit_len = ((unsigned)ptr << 3);
|
bit_len = ((unsigned)ptr << 3);
|
||||||
z = 0x80 >> n;
|
buf[ptr>>3] = m256_const1_64( 0x80 );
|
||||||
zz = ((ub & -z) | z) & 0xFF;
|
|
||||||
buf[ptr>>3] = _mm256_set1_epi64x( zz );
|
|
||||||
tl = sc->T0 + bit_len;
|
tl = sc->T0 + bit_len;
|
||||||
th = sc->T1;
|
th = sc->T1;
|
||||||
if (ptr == 0 )
|
if (ptr == 0 )
|
||||||
@@ -798,40 +760,41 @@ blake64_4way_close( blake_4way_big_context *sc,
|
|||||||
{
|
{
|
||||||
sc->T0 -= 1024 - bit_len;
|
sc->T0 -= 1024 - bit_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( ptr <= 104 )
|
if ( ptr <= 104 )
|
||||||
{
|
{
|
||||||
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||||
if ( out_size_w64 == 8 )
|
buf[104>>3] = _mm256_or_si256( buf[104>>3],
|
||||||
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
|
|
||||||
m256_const1_64( 0x0100000000000000ULL ) );
|
m256_const1_64( 0x0100000000000000ULL ) );
|
||||||
*(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
|
buf[112>>3] = m256_const1_64( bswap_64( th ) );
|
||||||
*(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
|
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
|
||||||
|
|
||||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||||
|
|
||||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||||
memset_zero_256( buf, 112>>3 );
|
memset_zero_256( buf, 112>>3 );
|
||||||
if ( out_size_w64 == 8 )
|
buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
|
||||||
buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
|
buf[112>>3] = m256_const1_64( bswap_64( th ) );
|
||||||
*(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
|
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
|
||||||
*(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
|
|
||||||
|
|
||||||
blake64_4way( sc, buf, 128 );
|
blake64_4way( sc, buf, 128 );
|
||||||
}
|
}
|
||||||
mm256_block_bswap_64( (__m256i*)dst, sc->H );
|
mm256_block_bswap_64( (__m256i*)dst, sc->H );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
void
|
void
|
||||||
blake512_4way_init(void *cc)
|
blake512_4way_init(void *cc)
|
||||||
{
|
{
|
||||||
blake64_4way_init(cc, IV512, salt_zero_big);
|
blake64_4way_init(cc, IV512, salt_zero_big);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
void
|
void
|
||||||
blake512_4way_update(void *cc, const void *data, size_t len)
|
blake512_4way_update(void *cc, const void *data, size_t len)
|
||||||
@@ -842,15 +805,18 @@ blake512_4way_update(void *cc, const void *data, size_t len)
|
|||||||
void
|
void
|
||||||
blake512_4way_close(void *cc, void *dst)
|
blake512_4way_close(void *cc, void *dst)
|
||||||
{
|
{
|
||||||
blake512_4way_addbits_and_close(cc, 0, 0, dst);
|
blake64_4way_close( cc, dst );
|
||||||
|
|
||||||
|
// blake512_4way_addbits_and_close(cc, dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
void
|
void
|
||||||
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||||
{
|
{
|
||||||
blake64_4way_close(cc, ub, n, dst, 8);
|
blake64_4way_close(cc, ub, n, dst, 8);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@@ -64,7 +64,8 @@ typedef bmw_4way_small_context bmw256_4way_context;
|
|||||||
|
|
||||||
void bmw256_4way_init( bmw256_4way_context *ctx );
|
void bmw256_4way_init( bmw256_4way_context *ctx );
|
||||||
|
|
||||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
void bmw256_4way_update(void *cc, const void *data, size_t len);
|
||||||
|
#define bmw256_4way bmw256_4way_update
|
||||||
|
|
||||||
void bmw256_4way_close(void *cc, void *dst);
|
void bmw256_4way_close(void *cc, void *dst);
|
||||||
|
|
||||||
@@ -87,11 +88,33 @@ typedef struct {
|
|||||||
typedef bmw_8way_small_context bmw256_8way_context;
|
typedef bmw_8way_small_context bmw256_8way_context;
|
||||||
|
|
||||||
void bmw256_8way_init( bmw256_8way_context *ctx );
|
void bmw256_8way_init( bmw256_8way_context *ctx );
|
||||||
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
|
void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
|
||||||
|
size_t len );
|
||||||
|
#define bmw256_8way bmw256_8way_update
|
||||||
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
|
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
// BMW-256 16 way 32
|
||||||
|
|
||||||
|
// Hashing state for BMW-256 computed on 16 lanes at once. Every __m512i
// holds one 32-bit word from each of the 16 lanes.
typedef struct
{
   __m512i  buf[16];     // partially filled message block, 16 words per lane
   __m512i  H[16];       // chaining value, 16 words per lane
   size_t   ptr;         // bytes of one lane currently held in buf
   uint32_t bit_count;   // bits absorbed so far, assumed to fit in 32 bits
} bmw_16way_small_context __attribute__ ((aligned (128)));

typedef bmw_16way_small_context bmw256_16way_context;

// Load the initial chaining value and clear the byte and bit counters.
void bmw256_16way_init( bmw256_16way_context *ctx );

// Absorb len bytes per lane from data, which is read as an array of
// __m512i (4 bytes of each lane per vector).
void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
                          size_t len );

// Pad, run the final compressions and store 8 __m512i words to dst.
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(__SSE2__)
|
#if defined(__SSE2__)
|
||||||
|
|
||||||
|
@@ -564,7 +564,7 @@ bmw256_4way_init(void *cc)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
void
|
void
|
||||||
bmw256_4way(void *cc, const void *data, size_t len)
|
bmw256_4way_update(void *cc, const void *data, size_t len)
|
||||||
{
|
{
|
||||||
bmw32_4way(cc, data, len);
|
bmw32_4way(cc, data, len);
|
||||||
}
|
}
|
||||||
@@ -1014,7 +1014,8 @@ void bmw256_8way_init( bmw256_8way_context *ctx )
|
|||||||
ctx->bit_count = 0;
|
ctx->bit_count = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
|
void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
|
||||||
|
size_t len )
|
||||||
{
|
{
|
||||||
__m256i *vdata = (__m256i*)data;
|
__m256i *vdata = (__m256i*)data;
|
||||||
__m256i *buf;
|
__m256i *buf;
|
||||||
@@ -1092,6 +1093,513 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
|
|||||||
|
|
||||||
#endif // __AVX2__
|
#endif // __AVX2__
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
// BMW-256 16 way 32
|
||||||
|
|
||||||
|
|
||||||
|
// BMW-256 s0..s3 on every 32-bit element: xor of a right shift, a left
// shift and two left rotations of x.
// x is expanded several times, pass operands without side effects.

#define s16s0(x) \
   mm512_xor4( _mm512_srli_epi32( (x),  1 ), _mm512_slli_epi32( (x),  3 ), \
               mm512_rol_32(      (x),  4 ), mm512_rol_32(      (x), 19 ) )

#define s16s1(x) \
   mm512_xor4( _mm512_srli_epi32( (x),  1 ), _mm512_slli_epi32( (x),  2 ), \
               mm512_rol_32(      (x),  8 ), mm512_rol_32(      (x), 23 ) )

#define s16s2(x) \
   mm512_xor4( _mm512_srli_epi32( (x),  2 ), _mm512_slli_epi32( (x),  1 ), \
               mm512_rol_32(      (x), 12 ), mm512_rol_32(      (x), 25 ) )

#define s16s3(x) \
   mm512_xor4( _mm512_srli_epi32( (x),  2 ), _mm512_slli_epi32( (x),  2 ), \
               mm512_rol_32(      (x), 15 ), mm512_rol_32(      (x), 29 ) )

// s4 and s5: x xored with a copy of itself shifted right by 1 or 2 bits.

#define s16s4(x)   _mm512_xor_si512( (x), _mm512_srli_epi32( (x), 1 ) )

#define s16s5(x)   _mm512_xor_si512( (x), _mm512_srli_epi32( (x), 2 ) )

// r1..r7: left rotation of every 32-bit element by a fixed count.

#define r16s1(x)   mm512_rol_32( x,  3 )
#define r16s2(x)   mm512_rol_32( x,  7 )
#define r16s3(x)   mm512_rol_32( x, 13 )
#define r16s4(x)   mm512_rol_32( x, 16 )
#define r16s5(x)   mm512_rol_32( x, 19 )
#define r16s6(x)   mm512_rol_32( x, 23 )
#define r16s7(x)   mm512_rol_32( x, 27 )
|
||||||
|
|
||||||
|
// Message word number ( j + off ) mod 16, rotated left by that word number
// plus one.
#define mm512_rol_off_32( M, j, off ) \
   mm512_rol_32( M[ ( (j) + (off) ) & 0xF ], \
                 ( ( (j) + (off) ) & 0xF ) + 1 )

// Message-dependent term of the expansion, word indexes taken mod 16:
//   ( rol(M[j]) + rol(M[j+3]) - rol(M[j+10]) + (j+16) * 0x05555555 ) ^ H[j+7]
#define add_elt_s16( M, H, j ) \
   _mm512_xor_si512( \
      _mm512_add_epi32( \
         _mm512_sub_epi32( \
            _mm512_add_epi32( mm512_rol_off_32( M, j,  0 ), \
                              mm512_rol_off_32( M, j,  3 ) ), \
            mm512_rol_off_32( M, j, 10 ) ), \
         _mm512_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
      H[ ( (j)+7 ) & 0xF ] )
|
||||||
|
|
||||||
|
// Sum of s1, s2, s3, s0 applied to four consecutive words starting at qt[k].
#define expand1s16_quad( qt, k ) \
   mm512_add4_32( s16s1( qt[ (k)     ] ), s16s2( qt[ (k) + 1 ] ), \
                  s16s3( qt[ (k) + 2 ] ), s16s0( qt[ (k) + 3 ] ) )

// First expansion function: the add_elt term for word i-16 plus the s1, s2,
// s3, s0 pattern applied over the sixteen preceding words qt[i-16..i-1].
#define expand1s16( qt, M, H, i ) \
   _mm512_add_epi32( add_elt_s16( M, H, (i)-16 ), \
      mm512_add4_32( expand1s16_quad( qt, (i)-16 ), \
                     expand1s16_quad( qt, (i)-12 ), \
                     expand1s16_quad( qt, (i)- 8 ), \
                     expand1s16_quad( qt, (i)- 4 ) ) )
|
||||||
|
|
||||||
|
// Second expansion function: the add_elt term for word i-16 plus the sixteen
// preceding words, where the odd-offset words go through r1..r7 and the last
// two through s4 and s5.
#define expand2s16( qt, M, H, i) \
   _mm512_add_epi32( add_elt_s16( M, H, (i)-16 ), \
      mm512_add4_32( \
         mm512_add4_32(        qt[ (i)-16 ],   r16s1( qt[ (i)-15 ] ), \
                               qt[ (i)-14 ],   r16s2( qt[ (i)-13 ] ) ), \
         mm512_add4_32(        qt[ (i)-12 ],   r16s3( qt[ (i)-11 ] ), \
                               qt[ (i)-10 ],   r16s4( qt[ (i)- 9 ] ) ), \
         mm512_add4_32(        qt[ (i)- 8 ],   r16s5( qt[ (i)- 7 ] ), \
                               qt[ (i)- 6 ],   r16s6( qt[ (i)- 5 ] ) ), \
         mm512_add4_32(        qt[ (i)- 4 ],   r16s7( qt[ (i)- 3 ] ), \
                        s16s4( qt[ (i)- 2 ] ), s16s5( qt[ (i)- 1 ] ) ) ) )
|
||||||
|
|
||||||
|
|
||||||
|
// W16s0..W16s15: signed sums of five M[i] ^ H[i] terms. M and H are taken by
// name from the scope where the macro is used. The comment above each macro
// gives the term numbers and signs.

#define W16s_MH( i )   _mm512_xor_si512( M[i], H[i] )

// 5 - 7 + 10 + 13 + 14
#define W16s0 \
   _mm512_add_epi32( \
      _mm512_add_epi32( \
         _mm512_sub_epi32( W16s_MH( 5), W16s_MH( 7) ), W16s_MH(10) ), \
      _mm512_add_epi32( W16s_MH(13), W16s_MH(14) ) )

// 6 - 8 + 11 + 14 - 15
#define W16s1 \
   _mm512_add_epi32( \
      _mm512_add_epi32( \
         _mm512_sub_epi32( W16s_MH( 6), W16s_MH( 8) ), W16s_MH(11) ), \
      _mm512_sub_epi32( W16s_MH(14), W16s_MH(15) ) )

// 0 + 7 + 9 - 12 + 15
#define W16s2 \
   _mm512_sub_epi32( \
      _mm512_add_epi32( \
         _mm512_add_epi32( W16s_MH( 0), W16s_MH( 7) ), W16s_MH( 9) ), \
      _mm512_sub_epi32( W16s_MH(12), W16s_MH(15) ) )

// 0 - 1 + 8 - 10 + 13
#define W16s3 \
   _mm512_sub_epi32( \
      _mm512_add_epi32( \
         _mm512_sub_epi32( W16s_MH( 0), W16s_MH( 1) ), W16s_MH( 8) ), \
      _mm512_sub_epi32( W16s_MH(10), W16s_MH(13) ) )

// 1 + 2 + 9 - 11 - 14
#define W16s4 \
   _mm512_sub_epi32( \
      _mm512_add_epi32( \
         _mm512_add_epi32( W16s_MH( 1), W16s_MH( 2) ), W16s_MH( 9) ), \
      _mm512_add_epi32( W16s_MH(11), W16s_MH(14) ) )

// 3 - 2 + 10 - 12 + 15
#define W16s5 \
   _mm512_sub_epi32( \
      _mm512_add_epi32( \
         _mm512_sub_epi32( W16s_MH( 3), W16s_MH( 2) ), W16s_MH(10) ), \
      _mm512_sub_epi32( W16s_MH(12), W16s_MH(15) ) )

// 4 - 0 - 3 - 11 + 13
#define W16s6 \
   _mm512_sub_epi32( \
      _mm512_sub_epi32( \
         _mm512_sub_epi32( W16s_MH( 4), W16s_MH( 0) ), W16s_MH( 3) ), \
      _mm512_sub_epi32( W16s_MH(11), W16s_MH(13) ) )

// 1 - 4 - 5 - 12 - 14
#define W16s7 \
   _mm512_sub_epi32( \
      _mm512_sub_epi32( \
         _mm512_sub_epi32( W16s_MH( 1), W16s_MH( 4) ), W16s_MH( 5) ), \
      _mm512_add_epi32( W16s_MH(12), W16s_MH(14) ) )

// 2 - 5 - 6 + 13 - 15
#define W16s8 \
   _mm512_add_epi32( \
      _mm512_sub_epi32( \
         _mm512_sub_epi32( W16s_MH( 2), W16s_MH( 5) ), W16s_MH( 6) ), \
      _mm512_sub_epi32( W16s_MH(13), W16s_MH(15) ) )

// 0 - 3 + 6 - 7 + 14
#define W16s9 \
   _mm512_sub_epi32( \
      _mm512_add_epi32( \
         _mm512_sub_epi32( W16s_MH( 0), W16s_MH( 3) ), W16s_MH( 6) ), \
      _mm512_sub_epi32( W16s_MH( 7), W16s_MH(14) ) )

// 8 - 1 - 4 - 7 + 15
#define W16s10 \
   _mm512_sub_epi32( \
      _mm512_sub_epi32( \
         _mm512_sub_epi32( W16s_MH( 8), W16s_MH( 1) ), W16s_MH( 4) ), \
      _mm512_sub_epi32( W16s_MH( 7), W16s_MH(15) ) )

// 8 - 0 - 2 - 5 + 9
#define W16s11 \
   _mm512_sub_epi32( \
      _mm512_sub_epi32( \
         _mm512_sub_epi32( W16s_MH( 8), W16s_MH( 0) ), W16s_MH( 2) ), \
      _mm512_sub_epi32( W16s_MH( 5), W16s_MH( 9) ) )

// 1 + 3 - 6 - 9 + 10
#define W16s12 \
   _mm512_sub_epi32( \
      _mm512_sub_epi32( \
         _mm512_add_epi32( W16s_MH( 1), W16s_MH( 3) ), W16s_MH( 6) ), \
      _mm512_sub_epi32( W16s_MH( 9), W16s_MH(10) ) )

// 2 + 4 + 7 + 10 + 11
#define W16s13 \
   _mm512_add_epi32( \
      _mm512_add_epi32( \
         _mm512_add_epi32( W16s_MH( 2), W16s_MH( 4) ), W16s_MH( 7) ), \
      _mm512_add_epi32( W16s_MH(10), W16s_MH(11) ) )

// 3 - 5 + 8 - 11 - 12
#define W16s14 \
   _mm512_sub_epi32( \
      _mm512_add_epi32( \
         _mm512_sub_epi32( W16s_MH( 3), W16s_MH( 5) ), W16s_MH( 8) ), \
      _mm512_add_epi32( W16s_MH(11), W16s_MH(12) ) )

// 12 - 4 - 6 - 9 + 13
#define W16s15 \
   _mm512_sub_epi32( \
      _mm512_sub_epi32( \
         _mm512_sub_epi32( W16s_MH(12), W16s_MH( 4) ), W16s_MH( 6) ), \
      _mm512_sub_epi32( W16s_MH( 9), W16s_MH(13) ) )
|
||||||
|
|
||||||
|
void compress_small_16way( const __m512i *M, const __m512i H[16],
|
||||||
|
__m512i dH[16] )
|
||||||
|
{
|
||||||
|
__m512i qt[32], xl, xh;
|
||||||
|
|
||||||
|
qt[ 0] = _mm512_add_epi32( s16s0( W16s0 ), H[ 1] );
|
||||||
|
qt[ 1] = _mm512_add_epi32( s16s1( W16s1 ), H[ 2] );
|
||||||
|
qt[ 2] = _mm512_add_epi32( s16s2( W16s2 ), H[ 3] );
|
||||||
|
qt[ 3] = _mm512_add_epi32( s16s3( W16s3 ), H[ 4] );
|
||||||
|
qt[ 4] = _mm512_add_epi32( s16s4( W16s4 ), H[ 5] );
|
||||||
|
qt[ 5] = _mm512_add_epi32( s16s0( W16s5 ), H[ 6] );
|
||||||
|
qt[ 6] = _mm512_add_epi32( s16s1( W16s6 ), H[ 7] );
|
||||||
|
qt[ 7] = _mm512_add_epi32( s16s2( W16s7 ), H[ 8] );
|
||||||
|
qt[ 8] = _mm512_add_epi32( s16s3( W16s8 ), H[ 9] );
|
||||||
|
qt[ 9] = _mm512_add_epi32( s16s4( W16s9 ), H[10] );
|
||||||
|
qt[10] = _mm512_add_epi32( s16s0( W16s10), H[11] );
|
||||||
|
qt[11] = _mm512_add_epi32( s16s1( W16s11), H[12] );
|
||||||
|
qt[12] = _mm512_add_epi32( s16s2( W16s12), H[13] );
|
||||||
|
qt[13] = _mm512_add_epi32( s16s3( W16s13), H[14] );
|
||||||
|
qt[14] = _mm512_add_epi32( s16s4( W16s14), H[15] );
|
||||||
|
qt[15] = _mm512_add_epi32( s16s0( W16s15), H[ 0] );
|
||||||
|
qt[16] = expand1s16( qt, M, H, 16 );
|
||||||
|
qt[17] = expand1s16( qt, M, H, 17 );
|
||||||
|
qt[18] = expand2s16( qt, M, H, 18 );
|
||||||
|
qt[19] = expand2s16( qt, M, H, 19 );
|
||||||
|
qt[20] = expand2s16( qt, M, H, 20 );
|
||||||
|
qt[21] = expand2s16( qt, M, H, 21 );
|
||||||
|
qt[22] = expand2s16( qt, M, H, 22 );
|
||||||
|
qt[23] = expand2s16( qt, M, H, 23 );
|
||||||
|
qt[24] = expand2s16( qt, M, H, 24 );
|
||||||
|
qt[25] = expand2s16( qt, M, H, 25 );
|
||||||
|
qt[26] = expand2s16( qt, M, H, 26 );
|
||||||
|
qt[27] = expand2s16( qt, M, H, 27 );
|
||||||
|
qt[28] = expand2s16( qt, M, H, 28 );
|
||||||
|
qt[29] = expand2s16( qt, M, H, 29 );
|
||||||
|
qt[30] = expand2s16( qt, M, H, 30 );
|
||||||
|
qt[31] = expand2s16( qt, M, H, 31 );
|
||||||
|
|
||||||
|
xl = _mm512_xor_si512(
|
||||||
|
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||||
|
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||||
|
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||||
|
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||||
|
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||||
|
|
||||||
|
#define DH1L( m, sl, sr, a, b, c ) \
|
||||||
|
_mm512_add_epi32( \
|
||||||
|
_mm512_xor_si512( M[m], \
|
||||||
|
_mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
|
||||||
|
_mm512_srli_epi32( qt[a], sr ) ) ), \
|
||||||
|
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||||
|
|
||||||
|
#define DH1R( m, sl, sr, a, b, c ) \
|
||||||
|
_mm512_add_epi32( \
|
||||||
|
_mm512_xor_si512( M[m], \
|
||||||
|
_mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
|
||||||
|
_mm512_slli_epi32( qt[a], sr ) ) ), \
|
||||||
|
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||||
|
|
||||||
|
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||||
|
_mm512_add_epi32( _mm512_add_epi32( \
|
||||||
|
mm512_rol_32( dH[h], rl ), \
|
||||||
|
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||||
|
_mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
|
||||||
|
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||||
|
|
||||||
|
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||||
|
_mm512_add_epi32( _mm512_add_epi32( \
|
||||||
|
mm512_rol_32( dH[h], rl ), \
|
||||||
|
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||||
|
_mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
|
||||||
|
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||||
|
|
||||||
|
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||||
|
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||||
|
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
|
||||||
|
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
|
||||||
|
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
|
||||||
|
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
|
||||||
|
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
|
||||||
|
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
|
||||||
|
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
|
||||||
|
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
|
||||||
|
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
|
||||||
|
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
|
||||||
|
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
|
||||||
|
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
|
||||||
|
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
|
||||||
|
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
|
||||||
|
|
||||||
|
#undef DH1L
|
||||||
|
#undef DH1R
|
||||||
|
#undef DH2L
|
||||||
|
#undef DH2R
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Constant used in place of the chaining value for the last compression in
// bmw256_16way_close. Word i is 0xaaaaaaa0 + i in every 32-bit element,
// written here as eight identical 64-bit values per vector.
#define FINAL_S16_ROW( w )   { w, w, w, w, w, w, w, w }

static const __m512i final_s16[16] =
{
   FINAL_S16_ROW( 0xaaaaaaa0aaaaaaa0 ),
   FINAL_S16_ROW( 0xaaaaaaa1aaaaaaa1 ),
   FINAL_S16_ROW( 0xaaaaaaa2aaaaaaa2 ),
   FINAL_S16_ROW( 0xaaaaaaa3aaaaaaa3 ),
   FINAL_S16_ROW( 0xaaaaaaa4aaaaaaa4 ),
   FINAL_S16_ROW( 0xaaaaaaa5aaaaaaa5 ),
   FINAL_S16_ROW( 0xaaaaaaa6aaaaaaa6 ),
   FINAL_S16_ROW( 0xaaaaaaa7aaaaaaa7 ),
   FINAL_S16_ROW( 0xaaaaaaa8aaaaaaa8 ),
   FINAL_S16_ROW( 0xaaaaaaa9aaaaaaa9 ),
   FINAL_S16_ROW( 0xaaaaaaaaaaaaaaaa ),
   FINAL_S16_ROW( 0xaaaaaaabaaaaaaab ),
   FINAL_S16_ROW( 0xaaaaaaacaaaaaaac ),
   FINAL_S16_ROW( 0xaaaaaaadaaaaaaad ),
   FINAL_S16_ROW( 0xaaaaaaaeaaaaaaae ),
   FINAL_S16_ROW( 0xaaaaaaafaaaaaaaf )
};

#undef FINAL_S16_ROW
|
||||||
|
|
||||||
|
|
||||||
|
// Reset ctx for a new message: empty buffer, zero bit count and the initial
// chaining value, whose bytes count up from 0x40 to 0x7F across the 16 words
// (each 32-bit word is given twice per 64-bit constant).
void bmw256_16way_init( bmw256_16way_context *ctx )
{
   __m512i *iv = ctx->H;

   ctx->ptr       = 0;
   ctx->bit_count = 0;

   iv[ 0] = m512_const1_64( 0x4041424340414243 );
   iv[ 1] = m512_const1_64( 0x4445464744454647 );
   iv[ 2] = m512_const1_64( 0x48494A4B48494A4B );
   iv[ 3] = m512_const1_64( 0x4C4D4E4F4C4D4E4F );
   iv[ 4] = m512_const1_64( 0x5051525350515253 );
   iv[ 5] = m512_const1_64( 0x5455565754555657 );
   iv[ 6] = m512_const1_64( 0x58595A5B58595A5B );
   iv[ 7] = m512_const1_64( 0x5C5D5E5F5C5D5E5F );
   iv[ 8] = m512_const1_64( 0x6061626360616263 );
   iv[ 9] = m512_const1_64( 0x6465666764656667 );
   iv[10] = m512_const1_64( 0x68696A6B68696A6B );
   iv[11] = m512_const1_64( 0x6C6D6E6F6C6D6E6F );
   iv[12] = m512_const1_64( 0x7071727370717273 );
   iv[13] = m512_const1_64( 0x7475767774757677 );
   iv[14] = m512_const1_64( 0x78797A7B78797A7B );
   iv[15] = m512_const1_64( 0x7C7D7E7F7C7D7E7F );
}
|
||||||
|
|
||||||
|
// Absorb len bytes per lane from data.
// data is read as an array of __m512i, each vector carrying 4 bytes of every
// lane. Only whole 4-byte groups are copied (byte counts are divided by 4).
// Every time 64 bytes per lane are buffered the block is compressed.
void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
                          size_t len )
{
   const int lane_block = 64;      // bytes of one lane in a full block
   __m512i   scratch[16];
   __m512i  *src   = (__m512i*)data;
   __m512i  *block = ctx->buf;
   __m512i  *h_in  = ctx->H;       // chaining value fed to the next compression
   __m512i  *h_out = scratch;      // where that compression writes its result
   size_t    fill  = ctx->ptr;

   ctx->bit_count += len << 3;

   while ( len > 0 )
   {
      const size_t room = lane_block - fill;
      const size_t take = ( room > len ) ? len : room;

      memcpy_512( block + (fill>>2), src, take >> 2 );
      src  += take >> 2;
      len  -= take;
      fill += take;

      if ( fill == lane_block )
      {
         __m512i *swap;

         compress_small_16way( block, h_in, h_out );
         // The output becomes the input of the next block: ping-pong between
         // ctx->H and the scratch array instead of copying.
         swap  = h_in;
         h_in  = h_out;
         h_out = swap;
         fill  = 0;
      }
   }
   ctx->ptr = fill;

   // After an odd number of compressions the live value sits in scratch.
   if ( h_in != ctx->H )
      memcpy_512( ctx->H, h_in, 16 );
}
|
||||||
|
|
||||||
|
// Finish the hash: append the padding marker and the bit count, run the last
// message compression and the extra compression against final_s16, then store
// words 8..15 of the result to dst through casti_m512i, i.e. 8 vectors.
//
// Fix: the bit count occupies the last 8 bytes of the block (words 14 and 15),
// so a second block is needed whenever the padding marker reaches past byte
// 56. The previous test against buf_size - 4 let the marker land in word 14
// when 56 bytes were buffered, where the bit count then overwrote it, and made
// the zero-fill count underflow.
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst )
{
   __m512i *buf;
   __m512i h1[16], h2[16], *h;
   size_t ptr, u, v;
   const int buf_size = 64;  // bytes of one lane, compatible with len
   const int len_size = 8;   // bytes of one lane taken by the bit count

   buf = ctx->buf;
   ptr = ctx->ptr;
   h = ctx->H;

   // Padding marker: 0x00000080 in every 32-bit element of the next word.
   buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
   ptr += 4;

   if ( ptr > (size_t)( buf_size - len_size ) )
   {
      // No room left for the bit count: zero the tail, compress this block
      // and put the bit count in a fresh one.
      memset_zero_512( buf + (ptr>>2), (buf_size - ptr) >> 2 );
      compress_small_16way( buf, h, h1 );
      ptr = 0;
      h = h1;
   }

   // Zero up to the length field, then store the 64-bit bit count as its low
   // word followed by a zero high word (bit_count is assumed to fit 32 bits).
   memset_zero_512( buf + (ptr>>2), (buf_size - len_size - ptr) >> 2 );
   buf[ (buf_size - 8) >> 2 ] = _mm512_set1_epi32( ctx->bit_count );
   buf[ (buf_size - 4) >> 2 ] = m512_zero;

   compress_small_16way( buf, h, h2 );

   // Final round: the chaining value is compressed as a message block with
   // final_s16 in the chaining value position.
   for ( u = 0; u < 16; u ++ )
      buf[u] = h2[u];

   compress_small_16way( buf, final_s16, h1 );

   for ( u = 0, v = 16 - 8; u < 8; u ++, v ++ )
      casti_m512i( dst, u ) = h1[v];
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif // AVX512
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@@ -18,16 +18,17 @@ void bmw512hash_8way(void *state, const void *input)
|
|||||||
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t hash[16*8] __attribute__ ((aligned (32)));
|
uint32_t hash[16*8] __attribute__ ((aligned (64)));
|
||||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
uint32_t n = pdata[19];
|
uint32_t n = pdata[19];
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 8;
|
||||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
// const uint32_t Htarg = ptarget[7];
|
const uint32_t Htarg = ptarget[7];
|
||||||
int thr_id = mythr->id;
|
int thr_id = mythr->id;
|
||||||
|
|
||||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
@@ -39,7 +40,8 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
|||||||
bmw512hash_8way( hash, vdata );
|
bmw512hash_8way( hash, vdata );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 8; lane++ )
|
for ( int lane = 0; lane < 8; lane++ )
|
||||||
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
|
||||||
|
// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||||
{
|
{
|
||||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||||
if ( fulltest( lane_hash, ptarget ) )
|
if ( fulltest( lane_hash, ptarget ) )
|
||||||
@@ -48,15 +50,14 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
|||||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
n += 4;
|
n += 8;
|
||||||
|
|
||||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart) );
|
||||||
|
|
||||||
*hashes_done = n - first_nonce + 1;
|
*hashes_done = n - first_nonce;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#elif defined(BMW512_4WAY)
|
#elif defined(BMW512_4WAY)
|
||||||
|
|
||||||
//#ifdef BMW512_4WAY
|
//#ifdef BMW512_4WAY
|
||||||
@@ -72,16 +73,17 @@ void bmw512hash_4way(void *state, const void *input)
|
|||||||
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
uint32_t vdata[24*4] __attribute__ ((aligned (128)));
|
||||||
uint32_t hash[16*4] __attribute__ ((aligned (32)));
|
uint32_t hash[16*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
uint32_t n = pdata[19];
|
uint32_t n = pdata[19];
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t last_nonce = max_nonce - 4;
|
||||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||||
// const uint32_t Htarg = ptarget[7];
|
const uint32_t Htarg = ptarget[7];
|
||||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||||
|
|
||||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||||
@@ -92,7 +94,8 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
|||||||
bmw512hash_4way( hash, vdata );
|
bmw512hash_4way( hash, vdata );
|
||||||
|
|
||||||
for ( int lane = 0; lane < 4; lane++ )
|
for ( int lane = 0; lane < 4; lane++ )
|
||||||
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
|
||||||
|
// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||||
{
|
{
|
||||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||||
if ( fulltest( lane_hash, ptarget ) )
|
if ( fulltest( lane_hash, ptarget ) )
|
||||||
@@ -103,9 +106,9 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
|||||||
}
|
}
|
||||||
n += 4;
|
n += 4;
|
||||||
|
|
||||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||||
|
|
||||||
*hashes_done = n - first_nonce + 1;
|
*hashes_done = n - first_nonce;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -58,8 +58,7 @@ static const sph_u64 IV512[] = {
|
|||||||
|
|
||||||
#if defined(__SSE2__)
|
#if defined(__SSE2__)
|
||||||
|
|
||||||
// BMW-512 2 way 64
|
// BMW-512 2 way 64
|
||||||
|
|
||||||
|
|
||||||
#define s2b0(x) \
|
#define s2b0(x) \
|
||||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
|
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
|
||||||
@@ -824,87 +823,57 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
|||||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||||
|
|
||||||
dH[ 0] = _mm256_add_epi64(
|
|
||||||
_mm256_xor_si256( M[0],
|
#define DH1L( m, sl, sr, a, b, c ) \
|
||||||
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
|
_mm256_add_epi64( \
|
||||||
_mm256_srli_epi64( qt[16], 5 ) ) ),
|
_mm256_xor_si256( M[m], \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
|
_mm256_xor_si256( _mm256_slli_epi64( xh, sl ), \
|
||||||
dH[ 1] = _mm256_add_epi64(
|
_mm256_srli_epi64( qt[a], sr ) ) ), \
|
||||||
_mm256_xor_si256( M[1],
|
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
|
|
||||||
_mm256_slli_epi64( qt[17], 8 ) ) ),
|
#define DH1R( m, sl, sr, a, b, c ) \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
|
_mm256_add_epi64( \
|
||||||
dH[ 2] = _mm256_add_epi64(
|
_mm256_xor_si256( M[m], \
|
||||||
_mm256_xor_si256( M[2],
|
_mm256_xor_si256( _mm256_srli_epi64( xh, sl ), \
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
|
_mm256_slli_epi64( qt[a], sr ) ) ), \
|
||||||
_mm256_slli_epi64( qt[18], 5 ) ) ),
|
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
|
|
||||||
dH[ 3] = _mm256_add_epi64(
|
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||||
_mm256_xor_si256( M[3],
|
_mm256_add_epi64( _mm256_add_epi64( \
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
|
mm256_rol_64( dH[h], rl ), \
|
||||||
_mm256_slli_epi64( qt[19], 5 ) ) ),
|
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
|
_mm256_xor_si256( _mm256_slli_epi64( xl, sl ), \
|
||||||
dH[ 4] = _mm256_add_epi64(
|
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||||
_mm256_xor_si256( M[4],
|
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
|
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||||
_mm256_slli_epi64( qt[20], 0 ) ) ),
|
_mm256_add_epi64( _mm256_add_epi64( \
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
|
mm256_rol_64( dH[h], rl ), \
|
||||||
dH[ 5] = _mm256_add_epi64(
|
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||||
_mm256_xor_si256( M[5],
|
_mm256_xor_si256( _mm256_srli_epi64( xl, sr ), \
|
||||||
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
|
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||||
_mm256_srli_epi64( qt[21], 6 ) ) ),
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
|
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||||
dH[ 6] = _mm256_add_epi64(
|
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||||
_mm256_xor_si256( M[6],
|
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
|
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
|
||||||
_mm256_slli_epi64( qt[22], 6 ) ) ),
|
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
|
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
|
||||||
dH[ 7] = _mm256_add_epi64(
|
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
|
||||||
_mm256_xor_si256( M[7],
|
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
|
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
|
||||||
_mm256_slli_epi64( qt[23], 2 ) ) ),
|
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
|
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
|
||||||
dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
|
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
|
||||||
mm256_rol_64( dH[4], 9 ),
|
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
|
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
|
||||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
|
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
|
||||||
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
|
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
|
||||||
dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
|
|
||||||
mm256_rol_64( dH[5], 10 ),
|
#undef DH1L
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
|
#undef DH1R
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
|
#undef DH2L
|
||||||
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
|
#undef DH2R
|
||||||
dH[10] = _mm256_add_epi64( _mm256_add_epi64(
|
}
|
||||||
mm256_rol_64( dH[6], 11 ),
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
|
|
||||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
|
|
||||||
_mm256_xor_si256( qt[17], qt[10] ) ) );
|
|
||||||
dH[11] = _mm256_add_epi64( _mm256_add_epi64(
|
|
||||||
mm256_rol_64( dH[7], 12 ),
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
|
|
||||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
|
|
||||||
_mm256_xor_si256( qt[18], qt[11] ) ) );
|
|
||||||
dH[12] = _mm256_add_epi64( _mm256_add_epi64(
|
|
||||||
mm256_rol_64( dH[0], 13 ),
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
|
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
|
|
||||||
_mm256_xor_si256( qt[19], qt[12] ) ) );
|
|
||||||
dH[13] = _mm256_add_epi64( _mm256_add_epi64(
|
|
||||||
mm256_rol_64( dH[1], 14 ),
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
|
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
|
|
||||||
_mm256_xor_si256( qt[20], qt[13] ) ) );
|
|
||||||
dH[14] = _mm256_add_epi64( _mm256_add_epi64(
|
|
||||||
mm256_rol_64( dH[2], 15 ),
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
|
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
|
|
||||||
_mm256_xor_si256( qt[21], qt[14] ) ) );
|
|
||||||
dH[15] = _mm256_add_epi64( _mm256_add_epi64(
|
|
||||||
mm256_rol_64( dH[3], 16 ),
|
|
||||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
|
|
||||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
|
|
||||||
_mm256_xor_si256( qt[22], qt[15] ) ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
static const __m256i final_b[16] =
|
static const __m256i final_b[16] =
|
||||||
{
|
{
|
||||||
|
@@ -28,6 +28,10 @@ static const uint64_t IV512[] =
|
|||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
// 4 way 128 is handy to avoid reinterleaving in many algos.
|
||||||
|
// If reinterleaving is necessary it may be more efficient to use
|
||||||
|
// 2 way 256. The same transform code should work for both.
|
||||||
|
|
||||||
static void transform_4way( cube_4way_context *sp )
|
static void transform_4way( cube_4way_context *sp )
|
||||||
{
|
{
|
||||||
int r;
|
int r;
|
||||||
@@ -201,6 +205,8 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
|
|||||||
|
|
||||||
#endif // AVX512
|
#endif // AVX512
|
||||||
|
|
||||||
|
// 2 way 128
|
||||||
|
|
||||||
static void transform_2way( cube_2way_context *sp )
|
static void transform_2way( cube_2way_context *sp )
|
||||||
{
|
{
|
||||||
int r;
|
int r;
|
||||||
|
@@ -1,203 +0,0 @@
|
|||||||
#if defined(__AVX2__)
|
|
||||||
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <memory.h>
|
|
||||||
#include "cube-hash-2way.h"
|
|
||||||
|
|
||||||
// 2x128
|
|
||||||
|
|
||||||
|
|
||||||
// The result of hashing 10 rounds of initial data which consists of params
|
|
||||||
// zero padded.
|
|
||||||
static const uint64_t IV256[] =
|
|
||||||
{
|
|
||||||
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
|
|
||||||
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
|
|
||||||
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
|
|
||||||
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
|
|
||||||
};
|
|
||||||
|
|
||||||
static const uint64_t IV512[] =
|
|
||||||
{
|
|
||||||
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
|
|
||||||
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
|
|
||||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
|
||||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
static void transform_2way( cube_2way_context *sp )
|
|
||||||
{
|
|
||||||
int r;
|
|
||||||
const int rounds = sp->rounds;
|
|
||||||
|
|
||||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
|
||||||
|
|
||||||
x0 = _mm256_load_si256( (__m256i*)sp->h );
|
|
||||||
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
|
|
||||||
x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
|
|
||||||
x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
|
|
||||||
x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
|
|
||||||
x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
|
|
||||||
x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
|
|
||||||
x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
|
|
||||||
|
|
||||||
for ( r = 0; r < rounds; ++r )
|
|
||||||
{
|
|
||||||
x4 = _mm256_add_epi32( x0, x4 );
|
|
||||||
x5 = _mm256_add_epi32( x1, x5 );
|
|
||||||
x6 = _mm256_add_epi32( x2, x6 );
|
|
||||||
x7 = _mm256_add_epi32( x3, x7 );
|
|
||||||
y0 = x0;
|
|
||||||
y1 = x1;
|
|
||||||
x0 = mm256_rol_32( x2, 7 );
|
|
||||||
x1 = mm256_rol_32( x3, 7 );
|
|
||||||
x2 = mm256_rol_32( y0, 7 );
|
|
||||||
x3 = mm256_rol_32( y1, 7 );
|
|
||||||
x0 = _mm256_xor_si256( x0, x4 );
|
|
||||||
x1 = _mm256_xor_si256( x1, x5 );
|
|
||||||
x2 = _mm256_xor_si256( x2, x6 );
|
|
||||||
x3 = _mm256_xor_si256( x3, x7 );
|
|
||||||
x4 = mm256_swap64_128( x4 );
|
|
||||||
x5 = mm256_swap64_128( x5 );
|
|
||||||
x6 = mm256_swap64_128( x6 );
|
|
||||||
x7 = mm256_swap64_128( x7 );
|
|
||||||
x4 = _mm256_add_epi32( x0, x4 );
|
|
||||||
x5 = _mm256_add_epi32( x1, x5 );
|
|
||||||
x6 = _mm256_add_epi32( x2, x6 );
|
|
||||||
x7 = _mm256_add_epi32( x3, x7 );
|
|
||||||
y0 = x0;
|
|
||||||
y1 = x2;
|
|
||||||
x0 = mm256_rol_32( x1, 11 );
|
|
||||||
x1 = mm256_rol_32( y0, 11 );
|
|
||||||
x2 = mm256_rol_32( x3, 11 );
|
|
||||||
x3 = mm256_rol_32( y1, 11 );
|
|
||||||
x0 = _mm256_xor_si256( x0, x4 );
|
|
||||||
x1 = _mm256_xor_si256( x1, x5 );
|
|
||||||
x2 = _mm256_xor_si256( x2, x6 );
|
|
||||||
x3 = _mm256_xor_si256( x3, x7 );
|
|
||||||
x4 = mm256_swap32_64( x4 );
|
|
||||||
x5 = mm256_swap32_64( x5 );
|
|
||||||
x6 = mm256_swap32_64( x6 );
|
|
||||||
x7 = mm256_swap32_64( x7 );
|
|
||||||
}
|
|
||||||
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h, x0 );
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h + 1, x1 );
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h + 2, x2 );
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h + 3, x3 );
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h + 4, x4 );
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
|
|
||||||
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
|
||||||
int blockbytes )
|
|
||||||
{
|
|
||||||
__m256i *h = (__m256i*)sp->h;
|
|
||||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
|
||||||
: (__m128i*)IV256 );
|
|
||||||
sp->hashlen = hashbitlen/128;
|
|
||||||
sp->blocksize = blockbytes/16;
|
|
||||||
sp->rounds = rounds;
|
|
||||||
sp->pos = 0;
|
|
||||||
|
|
||||||
h[ 0] = m256_const1_128( iv[0] );
|
|
||||||
h[ 1] = m256_const1_128( iv[1] );
|
|
||||||
h[ 2] = m256_const1_128( iv[2] );
|
|
||||||
h[ 3] = m256_const1_128( iv[3] );
|
|
||||||
h[ 4] = m256_const1_128( iv[4] );
|
|
||||||
h[ 5] = m256_const1_128( iv[5] );
|
|
||||||
h[ 6] = m256_const1_128( iv[6] );
|
|
||||||
h[ 7] = m256_const1_128( iv[7] );
|
|
||||||
h[ 0] = m256_const1_128( iv[0] );
|
|
||||||
h[ 1] = m256_const1_128( iv[1] );
|
|
||||||
h[ 2] = m256_const1_128( iv[2] );
|
|
||||||
h[ 3] = m256_const1_128( iv[3] );
|
|
||||||
h[ 4] = m256_const1_128( iv[4] );
|
|
||||||
h[ 5] = m256_const1_128( iv[5] );
|
|
||||||
h[ 6] = m256_const1_128( iv[6] );
|
|
||||||
h[ 7] = m256_const1_128( iv[7] );
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
|
||||||
{
|
|
||||||
const int len = size >> 4;
|
|
||||||
const __m256i *in = (__m256i*)data;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
|
||||||
// In current usage, data is either 64 or 80 bytes.
|
|
||||||
|
|
||||||
for ( i = 0; i < len; i++ )
|
|
||||||
{
|
|
||||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
|
||||||
sp->pos++;
|
|
||||||
if ( sp->pos == sp->blocksize )
|
|
||||||
{
|
|
||||||
transform_2way( sp );
|
|
||||||
sp->pos = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int cube_2way_close( cube_2way_context *sp, void *output )
|
|
||||||
{
|
|
||||||
__m256i *hash = (__m256i*)output;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
|
||||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
|
||||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
|
||||||
transform_2way( sp );
|
|
||||||
|
|
||||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
|
||||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
|
||||||
|
|
||||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
|
||||||
|
|
||||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
|
||||||
const void *data, size_t size )
|
|
||||||
{
|
|
||||||
const int len = size >> 4;
|
|
||||||
const __m256i *in = (__m256i*)data;
|
|
||||||
__m256i *hash = (__m256i*)output;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for ( i = 0; i < len; i++ )
|
|
||||||
{
|
|
||||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
|
||||||
sp->pos++;
|
|
||||||
if ( sp->pos == sp->blocksize )
|
|
||||||
{
|
|
||||||
transform_2way( sp );
|
|
||||||
sp->pos = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
|
||||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
|
||||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
|
||||||
transform_2way( sp );
|
|
||||||
|
|
||||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
|
||||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
|
||||||
|
|
||||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
|
||||||
|
|
||||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
@@ -1,36 +0,0 @@
|
|||||||
#ifndef CUBE_HASH_2WAY_H__
|
|
||||||
#define CUBE_HASH_2WAY_H__
|
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include "simd-utils.h"
|
|
||||||
|
|
||||||
// 2x128, 2 way parallel AVX2
|
|
||||||
|
|
||||||
struct _cube_2way_context
|
|
||||||
{
|
|
||||||
__m256i h[8];
|
|
||||||
int hashlen; // __m128i
|
|
||||||
int rounds;
|
|
||||||
int blocksize; // __m128i
|
|
||||||
int pos; // number of __m128i read into x from current block
|
|
||||||
} __attribute__ ((aligned (64)));
|
|
||||||
|
|
||||||
typedef struct _cube_2way_context cube_2way_context;
|
|
||||||
|
|
||||||
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
|
|
||||||
int blockbytes );
|
|
||||||
// reinitialize context with same parameters, much faster.
|
|
||||||
int cube_2way_reinit( cube_2way_context *sp );
|
|
||||||
|
|
||||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
|
|
||||||
|
|
||||||
int cube_2way_close( cube_2way_context *sp, void *output );
|
|
||||||
|
|
||||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
|
||||||
const void *data, size_t size );
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
||||||
#endif
|
|
@@ -1,6 +1,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#include "luffa-hash-2way.h"
|
#include "luffa-hash-2way.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX2__)
|
||||||
|
|
||||||
@@ -318,22 +319,6 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
|||||||
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
||||||
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
||||||
|
|
||||||
MULT24W( chainv[2], chainv[3], MASK );
|
|
||||||
chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
|
|
||||||
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
|
|
||||||
|
|
||||||
MULT24W( chainv[0], chainv[1], MASK );
|
|
||||||
chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
|
|
||||||
chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
|
|
||||||
|
|
||||||
MULT24W( msg0, msg1, MASK );
|
|
||||||
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
|
|
||||||
chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
|
|
||||||
|
|
||||||
MULT24W( msg0, msg1, MASK );
|
|
||||||
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
|
||||||
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
|
||||||
|
|
||||||
MULT24W( msg0, msg1, MASK );
|
MULT24W( msg0, msg1, MASK );
|
||||||
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
|
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
|
||||||
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
|
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
|
||||||
@@ -345,14 +330,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
|||||||
MULT24W( msg0, msg1, MASK );
|
MULT24W( msg0, msg1, MASK );
|
||||||
|
|
||||||
// replace with ror
|
// rotate left (rol) by 1, 2, 3 and 4 bits
|
||||||
chainv[3] = _mm512_or_si512( _mm512_slli_epi32( chainv[3], 1 ),
|
chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
|
||||||
_mm512_srli_epi32( chainv[3], 31 ) );
|
chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
|
||||||
chainv[5] = _mm512_or_si512( _mm512_slli_epi32( chainv[5], 2 ),
|
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
|
||||||
_mm512_srli_epi32( chainv[5], 30 ) );
|
chainv[9] = _mm512_rol_epi32( chainv[9], 4 );
|
||||||
chainv[7] = _mm512_or_si512( _mm512_slli_epi32( chainv[7], 3 ),
|
|
||||||
_mm512_srli_epi32( chainv[7], 29 ) );
|
|
||||||
chainv[9] = _mm512_or_si512( _mm512_slli_epi32( chainv[9], 4 ),
|
|
||||||
_mm512_srli_epi32( chainv[9], 28 ) );
|
|
||||||
|
|
||||||
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
|
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||||
x[0], x[1], x[2], x[3],
|
x[0], x[1], x[2], x[3],
|
||||||
@@ -394,7 +375,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
|||||||
|
|
||||||
void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||||
{
|
{
|
||||||
uint32 hash[8*4] __attribute((aligned(128)));
|
uint32_t hash[8*4] __attribute((aligned(128)));
|
||||||
__m512i* chainv = state->chainv;
|
__m512i* chainv = state->chainv;
|
||||||
__m512i t[2];
|
__m512i t[2];
|
||||||
__m512i zero[2];
|
__m512i zero[2];
|
||||||
@@ -424,7 +405,7 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
|||||||
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
||||||
|
|
||||||
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
|
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
|
||||||
_mm512_store_si512( (__m512i*)&hash[8], t[1] );
|
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
||||||
|
|
||||||
casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
|
casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
|
||||||
casti_m512i( hash, 0 ), shuff_bswap32 );
|
casti_m512i( hash, 0 ), shuff_bswap32 );
|
||||||
@@ -448,7 +429,7 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
|||||||
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
||||||
|
|
||||||
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
|
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
|
||||||
_mm512_store_si512( (__m512i*)&hash[8], t[1] );
|
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
||||||
|
|
||||||
casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
|
casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
|
||||||
casti_m512i( hash, 0 ), shuff_bswap32 );
|
casti_m512i( hash, 0 ), shuff_bswap32 );
|
||||||
@@ -493,8 +474,8 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
|
|||||||
0x2c2d2e2f28292a2b, 0x2425262720212223,
|
0x2c2d2e2f28292a2b, 0x2425262720212223,
|
||||||
0x1c1d1e1f18191a1b, 0x1415161710111213,
|
0x1c1d1e1f18191a1b, 0x1415161710111213,
|
||||||
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||||
|
|
||||||
state-> rembytes = (int)len & 0x1F;
|
state->rembytes = (int)len & 0x1F;
|
||||||
|
|
||||||
// full blocks
|
// full blocks
|
||||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||||
@@ -578,8 +559,9 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
|||||||
}
|
}
|
||||||
|
|
||||||
finalization512_4way( state, (uint32*)output );
|
finalization512_4way( state, (uint32*)output );
|
||||||
|
|
||||||
if ( state->hashbitlen > 512 )
|
if ( state->hashbitlen > 512 )
|
||||||
finalization512_4way( state, (uint32*)( output+32 ) );
|
finalization512_4way( state, (uint32*)( output+64 ) );
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -860,14 +842,10 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
|||||||
|
|
||||||
MULT2( msg0, msg1, MASK );
|
MULT2( msg0, msg1, MASK );
|
||||||
|
|
||||||
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
|
chainv[3] = mm256_rol_32( chainv[3], 1 );
|
||||||
_mm256_srli_epi32( chainv[3], 31 ) );
|
chainv[5] = mm256_rol_32( chainv[5], 2 );
|
||||||
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
|
chainv[7] = mm256_rol_32( chainv[7], 3 );
|
||||||
_mm256_srli_epi32( chainv[5], 30 ) );
|
chainv[9] = mm256_rol_32( chainv[9], 4 );
|
||||||
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
|
|
||||||
_mm256_srli_epi32( chainv[7], 29 ) );
|
|
||||||
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
|
|
||||||
_mm256_srli_epi32( chainv[9], 28 ) );
|
|
||||||
|
|
||||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||||
x[0], x[1], x[2], x[3],
|
x[0], x[1], x[2], x[3],
|
||||||
@@ -1093,6 +1071,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
|
|||||||
}
|
}
|
||||||
|
|
||||||
finalization512_2way( state, (uint32*)output );
|
finalization512_2way( state, (uint32*)output );
|
||||||
|
|
||||||
if ( state->hashbitlen > 512 )
|
if ( state->hashbitlen > 512 )
|
||||||
finalization512_2way( state, (uint32*)( output+32 ) );
|
finalization512_2way( state, (uint32*)( output+32 ) );
|
||||||
|
|
||||||
|
@@ -1,573 +0,0 @@
|
|||||||
#include <string.h>
|
|
||||||
#include <immintrin.h>
|
|
||||||
#include "luffa-hash-2way.h"
|
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
|
||||||
|
|
||||||
#include "simd-utils.h"
|
|
||||||
|
|
||||||
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
|
||||||
|
|
||||||
#define ADD_CONSTANT(a,b,c0,c1)\
|
|
||||||
a = _mm256_xor_si256(a,c0);\
|
|
||||||
b = _mm256_xor_si256(b,c1);\
|
|
||||||
|
|
||||||
#define MULT2( a0, a1, mask ) \
|
|
||||||
do { \
|
|
||||||
__m256i b = _mm256_xor_si256( a0, \
|
|
||||||
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
|
|
||||||
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
|
|
||||||
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
|
||||||
} while(0)
|
|
||||||
|
|
||||||
// confirm pointer arithmetic
|
|
||||||
// ok but use array indexes
|
|
||||||
#define STEP_PART(x,c0,c1,t)\
|
|
||||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
|
||||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
|
||||||
MIXWORD(*x,*(x+4),*t,*(t+1));\
|
|
||||||
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
|
|
||||||
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
|
|
||||||
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
|
|
||||||
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
|
||||||
|
|
||||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
|
||||||
t = _mm256_load_si256(&a0);\
|
|
||||||
a0 = _mm256_or_si256(a0,a1);\
|
|
||||||
a2 = _mm256_xor_si256(a2,a3);\
|
|
||||||
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
|
|
||||||
a0 = _mm256_xor_si256(a0,a3);\
|
|
||||||
a3 = _mm256_and_si256(a3,t);\
|
|
||||||
a1 = _mm256_xor_si256(a1,a3);\
|
|
||||||
a3 = _mm256_xor_si256(a3,a2);\
|
|
||||||
a2 = _mm256_and_si256(a2,a0);\
|
|
||||||
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
|
|
||||||
a2 = _mm256_xor_si256(a2,a1);\
|
|
||||||
a1 = _mm256_or_si256(a1,a3);\
|
|
||||||
t = _mm256_xor_si256(t,a1);\
|
|
||||||
a3 = _mm256_xor_si256(a3,a2);\
|
|
||||||
a2 = _mm256_and_si256(a2,a1);\
|
|
||||||
a1 = _mm256_xor_si256(a1,a0);\
|
|
||||||
a0 = _mm256_load_si256(&t);\
|
|
||||||
|
|
||||||
#define MIXWORD(a,b,t1,t2)\
|
|
||||||
b = _mm256_xor_si256(a,b);\
|
|
||||||
t1 = _mm256_slli_epi32(a,2);\
|
|
||||||
t2 = _mm256_srli_epi32(a,30);\
|
|
||||||
a = _mm256_or_si256(t1,t2);\
|
|
||||||
a = _mm256_xor_si256(a,b);\
|
|
||||||
t1 = _mm256_slli_epi32(b,14);\
|
|
||||||
t2 = _mm256_srli_epi32(b,18);\
|
|
||||||
b = _mm256_or_si256(t1,t2);\
|
|
||||||
b = _mm256_xor_si256(a,b);\
|
|
||||||
t1 = _mm256_slli_epi32(a,10);\
|
|
||||||
t2 = _mm256_srli_epi32(a,22);\
|
|
||||||
a = _mm256_or_si256(t1,t2);\
|
|
||||||
a = _mm256_xor_si256(a,b);\
|
|
||||||
t1 = _mm256_slli_epi32(b,1);\
|
|
||||||
t2 = _mm256_srli_epi32(b,31);\
|
|
||||||
b = _mm256_or_si256(t1,t2);
|
|
||||||
|
|
||||||
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
|
||||||
a1 = _mm256_shuffle_epi32(a1,147);\
|
|
||||||
t0 = _mm256_load_si256(&a1);\
|
|
||||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
|
||||||
t0 = _mm256_unpackhi_epi32(t0,a0);\
|
|
||||||
t1 = _mm256_shuffle_epi32(t0,78);\
|
|
||||||
a0 = _mm256_shuffle_epi32(a1,78);\
|
|
||||||
SUBCRUMB(t1,t0,a0,a1,tmp0);\
|
|
||||||
t0 = _mm256_unpacklo_epi32(t0,t1);\
|
|
||||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
|
||||||
a0 = _mm256_load_si256(&a1);\
|
|
||||||
a0 = _mm256_unpackhi_epi64(a0,t0);\
|
|
||||||
a1 = _mm256_unpacklo_epi64(a1,t0);\
|
|
||||||
a1 = _mm256_shuffle_epi32(a1,57);\
|
|
||||||
MIXWORD(a0,a1,tmp0,tmp1);\
|
|
||||||
ADD_CONSTANT(a0,a1,c0,c1);
|
|
||||||
|
|
||||||
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
|
||||||
s2 = _mm256_load_si256(&r1);\
|
|
||||||
q2 = _mm256_load_si256(&p1);\
|
|
||||||
r2 = _mm256_shuffle_epi32(r2,216);\
|
|
||||||
p2 = _mm256_shuffle_epi32(p2,216);\
|
|
||||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
|
||||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
|
||||||
s2 = _mm256_unpackhi_epi32(s2,r0);\
|
|
||||||
q2 = _mm256_unpackhi_epi32(q2,p0);\
|
|
||||||
s0 = _mm256_load_si256(&r2);\
|
|
||||||
q0 = _mm256_load_si256(&p2);\
|
|
||||||
r2 = _mm256_unpacklo_epi64(r2,r1);\
|
|
||||||
p2 = _mm256_unpacklo_epi64(p2,p1);\
|
|
||||||
s1 = _mm256_load_si256(&s0);\
|
|
||||||
q1 = _mm256_load_si256(&q0);\
|
|
||||||
s0 = _mm256_unpackhi_epi64(s0,r1);\
|
|
||||||
q0 = _mm256_unpackhi_epi64(q0,p1);\
|
|
||||||
r2 = _mm256_shuffle_epi32(r2,225);\
|
|
||||||
p2 = _mm256_shuffle_epi32(p2,225);\
|
|
||||||
r0 = _mm256_load_si256(&s1);\
|
|
||||||
p0 = _mm256_load_si256(&q1);\
|
|
||||||
s0 = _mm256_shuffle_epi32(s0,225);\
|
|
||||||
q0 = _mm256_shuffle_epi32(q0,225);\
|
|
||||||
s1 = _mm256_unpacklo_epi64(s1,s2);\
|
|
||||||
q1 = _mm256_unpacklo_epi64(q1,q2);\
|
|
||||||
r0 = _mm256_unpackhi_epi64(r0,s2);\
|
|
||||||
p0 = _mm256_unpackhi_epi64(p0,q2);\
|
|
||||||
s2 = _mm256_load_si256(&r0);\
|
|
||||||
q2 = _mm256_load_si256(&p0);\
|
|
||||||
s3 = _mm256_load_si256(&r2);\
|
|
||||||
q3 = _mm256_load_si256(&p2);\
|
|
||||||
|
|
||||||
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
|
||||||
s0 = _mm256_load_si256(&r0);\
|
|
||||||
q0 = _mm256_load_si256(&p0);\
|
|
||||||
s1 = _mm256_load_si256(&r2);\
|
|
||||||
q1 = _mm256_load_si256(&p2);\
|
|
||||||
r0 = _mm256_unpackhi_epi32(r0,r1);\
|
|
||||||
p0 = _mm256_unpackhi_epi32(p0,p1);\
|
|
||||||
r2 = _mm256_unpackhi_epi32(r2,r3);\
|
|
||||||
p2 = _mm256_unpackhi_epi32(p2,p3);\
|
|
||||||
s0 = _mm256_unpacklo_epi32(s0,r1);\
|
|
||||||
q0 = _mm256_unpacklo_epi32(q0,p1);\
|
|
||||||
s1 = _mm256_unpacklo_epi32(s1,r3);\
|
|
||||||
q1 = _mm256_unpacklo_epi32(q1,p3);\
|
|
||||||
r1 = _mm256_load_si256(&r0);\
|
|
||||||
p1 = _mm256_load_si256(&p0);\
|
|
||||||
r0 = _mm256_unpackhi_epi64(r0,r2);\
|
|
||||||
p0 = _mm256_unpackhi_epi64(p0,p2);\
|
|
||||||
s0 = _mm256_unpackhi_epi64(s0,s1);\
|
|
||||||
q0 = _mm256_unpackhi_epi64(q0,q1);\
|
|
||||||
r1 = _mm256_unpacklo_epi64(r1,r2);\
|
|
||||||
p1 = _mm256_unpacklo_epi64(p1,p2);\
|
|
||||||
s2 = _mm256_load_si256(&r0);\
|
|
||||||
q2 = _mm256_load_si256(&p0);\
|
|
||||||
s1 = _mm256_load_si256(&r1);\
|
|
||||||
q1 = _mm256_load_si256(&p1);\
|
|
||||||
|
|
||||||
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
|
||||||
s1 = _mm256_load_si256(&r3);\
|
|
||||||
q1 = _mm256_load_si256(&p3);\
|
|
||||||
s3 = _mm256_load_si256(&r3);\
|
|
||||||
q3 = _mm256_load_si256(&p3);\
|
|
||||||
s1 = _mm256_unpackhi_epi32(s1,r2);\
|
|
||||||
q1 = _mm256_unpackhi_epi32(q1,p2);\
|
|
||||||
s3 = _mm256_unpacklo_epi32(s3,r2);\
|
|
||||||
q3 = _mm256_unpacklo_epi32(q3,p2);\
|
|
||||||
s0 = _mm256_load_si256(&s1);\
|
|
||||||
q0 = _mm256_load_si256(&q1);\
|
|
||||||
s2 = _mm256_load_si256(&s3);\
|
|
||||||
q2 = _mm256_load_si256(&q3);\
|
|
||||||
r3 = _mm256_load_si256(&r1);\
|
|
||||||
p3 = _mm256_load_si256(&p1);\
|
|
||||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
|
||||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
|
||||||
r3 = _mm256_unpackhi_epi32(r3,r0);\
|
|
||||||
p3 = _mm256_unpackhi_epi32(p3,p0);\
|
|
||||||
s0 = _mm256_unpackhi_epi64(s0,r3);\
|
|
||||||
q0 = _mm256_unpackhi_epi64(q0,p3);\
|
|
||||||
s1 = _mm256_unpacklo_epi64(s1,r3);\
|
|
||||||
q1 = _mm256_unpacklo_epi64(q1,p3);\
|
|
||||||
s2 = _mm256_unpackhi_epi64(s2,r1);\
|
|
||||||
q2 = _mm256_unpackhi_epi64(q2,p1);\
|
|
||||||
s3 = _mm256_unpacklo_epi64(s3,r1);\
|
|
||||||
q3 = _mm256_unpacklo_epi64(q3,p1);
|
|
||||||
|
|
||||||
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
|
||||||
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
|
||||||
|
|
||||||
/* initial values of chaining variables */
|
|
||||||
static const uint32 IV[40] __attribute((aligned(32))) = {
|
|
||||||
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
|
|
||||||
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
|
|
||||||
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
|
|
||||||
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
|
|
||||||
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
|
|
||||||
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
|
|
||||||
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
|
|
||||||
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
|
|
||||||
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
|
|
||||||
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Round Constants */
|
|
||||||
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
|
|
||||||
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
|
|
||||||
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
|
|
||||||
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
|
|
||||||
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
|
|
||||||
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
|
|
||||||
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
|
|
||||||
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
|
|
||||||
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
|
|
||||||
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
|
|
||||||
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
|
|
||||||
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
|
|
||||||
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
|
|
||||||
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
|
|
||||||
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
|
|
||||||
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
|
|
||||||
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
|
|
||||||
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x5090d577,
|
|
||||||
0x00000000,0x00000000,0x00000000,0xac11d7fa,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x2d1925ab,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
|
|
||||||
0x00000000,0x00000000,0x00000000,0xb46496ac,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
|
|
||||||
0x00000000,0x00000000,0x00000000,0xd1925ab0,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x78602649,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x29131ab6,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x8edae952,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x0fc053c3,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x3b6ba548,
|
|
||||||
0x00000000,0x00000000,0x00000000,0x3f014f0c,
|
|
||||||
0x00000000,0x00000000,0x00000000,0xedae9520,
|
|
||||||
0x00000000,0x00000000,0x00000000,0xfc053c31
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/***************************************************/
|
|
||||||
/* Round function */
|
|
||||||
/* state: hash context */
|
|
||||||
|
|
||||||
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
|
||||||
{
|
|
||||||
__m256i t0, t1;
|
|
||||||
__m256i *chainv = state->chainv;
|
|
||||||
__m256i msg0, msg1;
|
|
||||||
__m256i tmp[2];
|
|
||||||
__m256i x[8];
|
|
||||||
const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
|
|
||||||
|
|
||||||
t0 = chainv[0];
|
|
||||||
t1 = chainv[1];
|
|
||||||
|
|
||||||
t0 = _mm256_xor_si256( t0, chainv[2] );
|
|
||||||
t1 = _mm256_xor_si256( t1, chainv[3] );
|
|
||||||
t0 = _mm256_xor_si256( t0, chainv[4] );
|
|
||||||
t1 = _mm256_xor_si256( t1, chainv[5] );
|
|
||||||
t0 = _mm256_xor_si256( t0, chainv[6] );
|
|
||||||
t1 = _mm256_xor_si256( t1, chainv[7] );
|
|
||||||
t0 = _mm256_xor_si256( t0, chainv[8] );
|
|
||||||
t1 = _mm256_xor_si256( t1, chainv[9] );
|
|
||||||
|
|
||||||
MULT2( t0, t1, MASK );
|
|
||||||
|
|
||||||
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
|
|
||||||
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
|
|
||||||
|
|
||||||
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
|
|
||||||
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
|
|
||||||
chainv[2] = _mm256_xor_si256( chainv[2], t0 );
|
|
||||||
chainv[3] = _mm256_xor_si256( chainv[3], t1 );
|
|
||||||
chainv[4] = _mm256_xor_si256( chainv[4], t0 );
|
|
||||||
chainv[5] = _mm256_xor_si256( chainv[5], t1 );
|
|
||||||
chainv[6] = _mm256_xor_si256( chainv[6], t0 );
|
|
||||||
chainv[7] = _mm256_xor_si256( chainv[7], t1 );
|
|
||||||
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
|
||||||
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
|
||||||
|
|
||||||
t0 = chainv[0];
|
|
||||||
t1 = chainv[1];
|
|
||||||
|
|
||||||
MULT2( chainv[0], chainv[1], MASK );
|
|
||||||
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
|
|
||||||
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
|
|
||||||
|
|
||||||
MULT2( chainv[2], chainv[3], MASK );
|
|
||||||
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
|
|
||||||
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
|
|
||||||
|
|
||||||
MULT2( chainv[4], chainv[5], MASK );
|
|
||||||
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
|
|
||||||
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
|
|
||||||
|
|
||||||
MULT2( chainv[6], chainv[7], MASK );
|
|
||||||
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
|
|
||||||
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
|
|
||||||
|
|
||||||
MULT2( chainv[8], chainv[9], MASK );
|
|
||||||
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
|
||||||
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
|
||||||
|
|
||||||
t0 = chainv[8];
|
|
||||||
t1 = chainv[9];
|
|
||||||
|
|
||||||
MULT2( chainv[8], chainv[9], MASK );
|
|
||||||
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
|
|
||||||
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
|
|
||||||
|
|
||||||
MULT2( chainv[6], chainv[7], MASK );
|
|
||||||
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
|
|
||||||
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
|
|
||||||
|
|
||||||
MULT2( chainv[4], chainv[5], MASK );
|
|
||||||
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
|
|
||||||
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
|
|
||||||
|
|
||||||
MULT2( chainv[2], chainv[3], MASK );
|
|
||||||
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
|
|
||||||
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
|
|
||||||
|
|
||||||
MULT2( chainv[0], chainv[1], MASK );
|
|
||||||
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
|
|
||||||
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
|
|
||||||
|
|
||||||
MULT2( msg0, msg1, MASK );
|
|
||||||
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
|
|
||||||
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
|
|
||||||
|
|
||||||
MULT2( msg0, msg1, MASK );
|
|
||||||
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
|
|
||||||
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
|
|
||||||
|
|
||||||
MULT2( msg0, msg1, MASK );
|
|
||||||
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
|
|
||||||
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
|
|
||||||
|
|
||||||
MULT2( msg0, msg1, MASK );
|
|
||||||
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
|
|
||||||
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
|
|
||||||
|
|
||||||
MULT2( msg0, msg1, MASK );
|
|
||||||
|
|
||||||
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
|
|
||||||
_mm256_srli_epi32( chainv[3], 31 ) );
|
|
||||||
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
|
|
||||||
_mm256_srli_epi32( chainv[5], 30 ) );
|
|
||||||
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
|
|
||||||
_mm256_srli_epi32( chainv[7], 29 ) );
|
|
||||||
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
|
|
||||||
_mm256_srli_epi32( chainv[9], 28 ) );
|
|
||||||
|
|
||||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
|
||||||
x[0], x[1], x[2], x[3],
|
|
||||||
chainv[1],chainv[3],chainv[5],chainv[7],
|
|
||||||
x[4], x[5], x[6], x[7] );
|
|
||||||
|
|
||||||
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
|
|
||||||
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
|
|
||||||
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
|
|
||||||
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
|
|
||||||
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
|
|
||||||
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
|
|
||||||
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
|
|
||||||
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
|
|
||||||
|
|
||||||
MIXTON1024( x[0], x[1], x[2], x[3],
|
|
||||||
chainv[0], chainv[2], chainv[4],chainv[6],
|
|
||||||
x[4], x[5], x[6], x[7],
|
|
||||||
chainv[1],chainv[3],chainv[5],chainv[7]);
|
|
||||||
|
|
||||||
/* Process last 256-bit block */
|
|
||||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
|
|
||||||
tmp[0], tmp[1] );
|
|
||||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
|
|
||||||
tmp[0], tmp[1] );
|
|
||||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
|
|
||||||
tmp[0], tmp[1] );
|
|
||||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
|
|
||||||
tmp[0], tmp[1] );
|
|
||||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
|
|
||||||
tmp[0], tmp[1] );
|
|
||||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
|
|
||||||
tmp[0], tmp[1] );
|
|
||||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
|
|
||||||
tmp[0], tmp[1] );
|
|
||||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
|
|
||||||
tmp[0], tmp[1] );
|
|
||||||
}
|
|
||||||
|
|
||||||
/***************************************************/
|
|
||||||
/* Finalization function */
|
|
||||||
/* state: hash context */
|
|
||||||
/* b[8]: hash values */
|
|
||||||
|
|
||||||
void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
|
||||||
{
|
|
||||||
uint32 hash[8] __attribute((aligned(64)));
|
|
||||||
__m256i* chainv = state->chainv;
|
|
||||||
__m256i t[2];
|
|
||||||
__m256i zero[2];
|
|
||||||
zero[0] = zero[1] = m256_zero;
|
|
||||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
|
||||||
0x1415161710111213,
|
|
||||||
0x0c0d0e0f08090a0b,
|
|
||||||
0x0405060700010203 );
|
|
||||||
/*---- blank round with m=0 ----*/
|
|
||||||
rnd512_2way( state, zero );
|
|
||||||
|
|
||||||
t[0] = chainv[0];
|
|
||||||
t[1] = chainv[1];
|
|
||||||
|
|
||||||
t[0] = _mm256_xor_si256( t[0], chainv[2] );
|
|
||||||
t[1] = _mm256_xor_si256( t[1], chainv[3] );
|
|
||||||
t[0] = _mm256_xor_si256( t[0], chainv[4] );
|
|
||||||
t[1] = _mm256_xor_si256( t[1], chainv[5] );
|
|
||||||
t[0] = _mm256_xor_si256( t[0], chainv[6] );
|
|
||||||
t[1] = _mm256_xor_si256( t[1], chainv[7] );
|
|
||||||
t[0] = _mm256_xor_si256( t[0], chainv[8] );
|
|
||||||
t[1] = _mm256_xor_si256( t[1], chainv[9] );
|
|
||||||
|
|
||||||
t[0] = _mm256_shuffle_epi32( t[0], 27 );
|
|
||||||
t[1] = _mm256_shuffle_epi32( t[1], 27 );
|
|
||||||
|
|
||||||
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
|
|
||||||
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
|
|
||||||
|
|
||||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
|
||||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
|
||||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
|
||||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
|
||||||
|
|
||||||
rnd512_2way( state, zero );
|
|
||||||
|
|
||||||
t[0] = chainv[0];
|
|
||||||
t[1] = chainv[1];
|
|
||||||
t[0] = _mm256_xor_si256( t[0], chainv[2] );
|
|
||||||
t[1] = _mm256_xor_si256( t[1], chainv[3] );
|
|
||||||
t[0] = _mm256_xor_si256( t[0], chainv[4] );
|
|
||||||
t[1] = _mm256_xor_si256( t[1], chainv[5] );
|
|
||||||
t[0] = _mm256_xor_si256( t[0], chainv[6] );
|
|
||||||
t[1] = _mm256_xor_si256( t[1], chainv[7] );
|
|
||||||
t[0] = _mm256_xor_si256( t[0], chainv[8] );
|
|
||||||
t[1] = _mm256_xor_si256( t[1], chainv[9] );
|
|
||||||
|
|
||||||
t[0] = _mm256_shuffle_epi32( t[0], 27 );
|
|
||||||
t[1] = _mm256_shuffle_epi32( t[1], 27 );
|
|
||||||
|
|
||||||
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
|
|
||||||
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
|
|
||||||
|
|
||||||
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
|
|
||||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
|
||||||
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
|
|
||||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Prepares a 2-way Luffa context: records hashbitlen, loads the ten
 * chaining values from IV and clears the first two __m256i of the
 * partial-block buffer.  state->rembytes is not touched here.
 * Always returns 0. */
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
    const __m128i *iv = (const __m128i*)IV;
    __m256i *buf = (__m256i*)state->buffer;
    int i;

    state->hashbitlen = hashbitlen;

    /* One 128-bit IV word per chaining value.
     * NOTE(review): m256_const1_128 presumably replicates it into both
     * 128-bit lanes -- confirm against its definition. */
    for ( i = 0; i < 10; i++ )
    {
        state->chainv[ i ] = m256_const1_128( iv[ i ] );
    }

    buf[0] = m256_zero;
    buf[1] = m256_zero;

    return 0;
}
|
|
||||||
|
|
||||||
// Do not call luffa_update_close after having called luffa_update.
|
|
||||||
// Once luffa_update has been called only call luffa_update or luffa_close.
|
|
||||||
int luffa_2way_update( luffa_2way_context *state, const void *data,
|
|
||||||
size_t len )
|
|
||||||
{
|
|
||||||
__m256i *vdata = (__m256i*)data;
|
|
||||||
__m256i *buffer = (__m256i*)state->buffer;
|
|
||||||
__m256i msg[2];
|
|
||||||
int i;
|
|
||||||
int blocks = (int)len >> 5;
|
|
||||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
|
||||||
0x1415161710111213,
|
|
||||||
0x0c0d0e0f08090a0b,
|
|
||||||
0x0405060700010203 );
|
|
||||||
state-> rembytes = (int)len & 0x1F;
|
|
||||||
|
|
||||||
// full blocks
|
|
||||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
|
||||||
{
|
|
||||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
|
||||||
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
|
||||||
rnd512_2way( state, msg );
|
|
||||||
}
|
|
||||||
|
|
||||||
// 16 byte partial block exists for 80 byte len
|
|
||||||
// store in buffer for transform in final for midstate to work
|
|
||||||
if ( state->rembytes )
|
|
||||||
{
|
|
||||||
// remaining data bytes
|
|
||||||
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
|
|
||||||
buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Transforms the final (padding) block and writes the digest.
 *
 * state:   hash context previously fed through luffa_2way_update
 * hashval: output buffer, handed to finalization512_2way
 *
 * Always returns 0. */
int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
    __m256i *buffer = (__m256i*)state->buffer;
    __m256i msg[2];

    // transform pad block
    if ( state->rembytes )
    {
        // not empty, data is in buffer
        rnd512_2way( state, buffer );
    }
    else
    {
        // empty pad block, constant data
        msg[0] = m256_const2_64( 0, 0x0000000080000000 );
        msg[1] = m256_zero;
        rnd512_2way( state, msg );
    }

    finalization512_2way( state, (uint32*)hashval );

    // Arithmetic on void* is a GNU extension; go through char* to keep the
    // same 32-byte offset in standard C.
    // NOTE(review): the first finalization call already writes past this
    // offset, so the two outputs overlap -- confirm the intended layout for
    // hashbitlen > 512.
    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( (char*)hashval + 32 ) );

    return 0;
}
|
|
||||||
|
|
||||||
/* One-shot update + close: absorbs all of data, transforms the padding
 * block and writes the digest to output.
 *
 * state:  hash context
 * output: digest buffer, handed to finalization512_2way
 * data:   input, read as __m256i pairs (two per 32 bytes of inlen)
 * inlen:  input length in bytes; a non-zero remainder modulo 32 is treated
 *         as one partial block read from a single __m256i
 *
 * Always returns 0. */
int luffa_2way_update_close( luffa_2way_context *state,
                 void *output, const void *data, size_t inlen )
{
    // Optimized for integrals of 16 bytes, good for 64 and 80 byte len
    const __m256i *vdata = (const __m256i*)data;
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
                                                 0x1415161710111213,
                                                 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;

    // full blocks
    for ( i = 0; i < blocks; i++, vdata+=2 )
    {
        msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
        msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
        rnd512_2way( state, msg );
    }

    if ( state->rembytes )
    {
        // 16 byte partial block exists for 80 byte len:
        // remaining data followed by the padding word
        msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
        msg[1] = m256_const2_64( 0, 0x0000000080000000 );
    }
    else
    {
        // empty pad block
        msg[0] = m256_const2_64( 0, 0x0000000080000000 );
        msg[1] = m256_zero;
    }
    rnd512_2way( state, msg );

    finalization512_2way( state, (uint32*)output );

    // Arithmetic on void* is a GNU extension; go through char* to keep the
    // same 32-byte offset in standard C.
    if ( state->hashbitlen > 512 )
        finalization512_2way( state, (uint32*)( (char*)output + 32 ) );

    return 0;
}
|
|
||||||
|
|
||||||
#endif
|
|
@@ -1,69 +0,0 @@
|
|||||||
#if !defined(LUFFA_HASH_2WAY_H__)
|
|
||||||
#define LUFFA_HASH_2WAY_H__ 1
|
|
||||||
/*
|
|
||||||
* luffa_for_sse2.h
|
|
||||||
* Version 2.0 (Sep 15th 2009)
|
|
||||||
*
|
|
||||||
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
|
|
||||||
*
|
|
||||||
* Hitachi, Ltd. is the owner of this software and hereby grant
|
|
||||||
* the U.S. Government and any interested party the right to use
|
|
||||||
* this software for the purposes of the SHA-3 evaluation process,
|
|
||||||
* notwithstanding that this software is copyrighted.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
||||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
||||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
||||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
||||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
||||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
||||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
|
||||||
|
|
||||||
#include <immintrin.h>
|
|
||||||
#include "algo/sha/sha3-defs.h"
|
|
||||||
#include "simd-utils.h"
|
|
||||||
|
|
||||||
/* Digest lengths, in bits */
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
#define DIGEST_BIT_LEN_384 384
#define DIGEST_BIT_LEN_512 512

/*********************************/
/* The parameters of Luffa       */

/* Bit length of a message block */
#define MSG_BLOCK_BIT_LEN 256
/* Byte length of a message block */
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3)

/* The number of blocks in Luffa */
#define WIDTH_224 3
#define WIDTH_256 3
#define WIDTH_384 4
#define WIDTH_512 5

/* The limit of the length of message */
#define LIMIT_224 64
#define LIMIT_256 64
#define LIMIT_384 128
#define LIMIT_512 128
/*********************************/
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
uint32 buffer[8*2] __attribute((aligned(64)));
|
|
||||||
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
|
|
||||||
int hashbitlen;
|
|
||||||
int rembytes;
|
|
||||||
} luffa_2way_context;
|
|
||||||
|
|
||||||
/* Set up a context for a digest of hashbitlen bits. */
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );

/* Absorb len bytes of data; may be called repeatedly before close. */
int luffa_2way_update( luffa_2way_context *state, const void *data,
                       size_t len );

/* Finish a hash started with luffa_2way_update and write it to hashval. */
int luffa_2way_close( luffa_2way_context *state, void *hashval );

/* Absorb inlen bytes of data and finish in a single call. */
int luffa_2way_update_close( luffa_2way_context *state, void *output,
                             const void *data, size_t inlen );
|
|
||||||
|
|
||||||
#endif
|
|
||||||
#endif
|
|
715
algo/lyra2/lyra2-hash-2way.c
Normal file
715
algo/lyra2/lyra2-hash-2way.c
Normal file
@@ -0,0 +1,715 @@
|
|||||||
|
/**
|
||||||
|
* Implementation of the Lyra2 Password Hashing Scheme (PHS).
|
||||||
|
*
|
||||||
|
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||||
|
*
|
||||||
|
* This software is hereby placed in the public domain.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
|
||||||
|
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
|
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||||
|
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||||
|
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <mm_malloc.h>
|
||||||
|
#include "compat.h"
|
||||||
|
#include "lyra2.h"
|
||||||
|
#include "sponge.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
|
||||||
|
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
|
||||||
|
* where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
|
||||||
|
* integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
|
||||||
|
* of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
|
||||||
|
*
|
||||||
|
* @param K The derived key to be output by the algorithm
|
||||||
|
* @param kLen Desired key length
|
||||||
|
* @param pwd User password
|
||||||
|
* @param pwdlen Password length
|
||||||
|
* @param salt Salt
|
||||||
|
* @param saltlen Salt length
|
||||||
|
* @param timeCost Parameter to determine the processing time (T)
|
||||||
|
* @param nRows Number or rows of the memory matrix (R)
|
||||||
|
* @param nCols Number of columns of the memory matrix (C)
|
||||||
|
*
|
||||||
|
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
|
||||||
|
*/
|
||||||
|
|
||||||
|
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||||
|
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
|
||||||
|
const uint64_t timeCost, const uint64_t nRows,
|
||||||
|
const uint64_t nCols )
|
||||||
|
{
|
||||||
|
//====================== Basic variables ============================//
|
||||||
|
uint64_t _ALIGN(256) state[16];
|
||||||
|
int64_t row = 2; //index of row to be processed
|
||||||
|
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||||
|
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||||
|
int64_t tau; //Time Loop iterator
|
||||||
|
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||||
|
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||||
|
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||||
|
// int64_t i; //auxiliary iteration counter
|
||||||
|
int64_t v64; // 64bit var for memcpy
|
||||||
|
//====================================================================/
|
||||||
|
|
||||||
|
//=== Initializing the Memory Matrix and pointers to it =============//
|
||||||
|
//Tries to allocate enough space for the whole memory matrix
|
||||||
|
|
||||||
|
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||||
|
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||||
|
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||||
|
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
|
||||||
|
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
|
||||||
|
uint64_t *ptrWord = wholeMatrix;
|
||||||
|
|
||||||
|
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||||
|
|
||||||
|
//=== Getting the password + salt + basil padded with 10*1 ==========//
|
||||||
|
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||||
|
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||||
|
|
||||||
|
//First, we clean enough blocks for the password, salt, basil and padding
|
||||||
|
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||||
|
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||||
|
|
||||||
|
byte *ptrByte = (byte*) wholeMatrix;
|
||||||
|
|
||||||
|
//Prepends the password
|
||||||
|
memcpy(ptrByte, pwd, pwdlen);
|
||||||
|
ptrByte += pwdlen;
|
||||||
|
|
||||||
|
//Concatenates the salt
|
||||||
|
memcpy(ptrByte, salt, saltlen);
|
||||||
|
ptrByte += saltlen;
|
||||||
|
|
||||||
|
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||||
|
- (saltlen + pwdlen) );
|
||||||
|
|
||||||
|
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||||
|
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = pwdlen;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = saltlen;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = timeCost;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = nRows;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = nCols;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
|
||||||
|
//Now comes the padding
|
||||||
|
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||||
|
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||||
|
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||||
|
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||||
|
|
||||||
|
// from here on it's all simd acces to state and matrix
|
||||||
|
// define vector pointers and adjust sizes and pointer offsets
|
||||||
|
|
||||||
|
//================= Initializing the Sponge State ====================//
|
||||||
|
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||||
|
|
||||||
|
// initState( state );
|
||||||
|
|
||||||
|
//========================= Setup Phase =============================//
|
||||||
|
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||||
|
|
||||||
|
ptrWord = wholeMatrix;
|
||||||
|
|
||||||
|
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||||
|
/*
|
||||||
|
for (i = 0; i < nBlocksInput; i++)
|
||||||
|
{
|
||||||
|
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||||
|
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
//Initializes M[0] and M[1]
|
||||||
|
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
|
||||||
|
|
||||||
|
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||||
|
nCols);
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||||
|
|
||||||
|
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||||
|
|
||||||
|
//updates the value of row* (deterministically picked during Setup))
|
||||||
|
rowa = (rowa + step) & (window - 1);
|
||||||
|
//update prev: it now points to the last row ever computed
|
||||||
|
|
||||||
|
prev = row;
|
||||||
|
//updates row: goes to the next row to be computed
|
||||||
|
row++;
|
||||||
|
|
||||||
|
//Checks if all rows in the window where visited.
|
||||||
|
if (rowa == 0)
|
||||||
|
{
|
||||||
|
step = window + gap; //changes the step: approximately doubles its value
|
||||||
|
window *= 2; //doubles the size of the re-visitation window
|
||||||
|
gap = -gap; //inverts the modifier to the step
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (row < nRows);
|
||||||
|
|
||||||
|
//===================== Wandering Phase =============================//
|
||||||
|
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||||
|
for (tau = 1; tau <= timeCost; tau++)
|
||||||
|
{
|
||||||
|
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||||
|
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
//Selects a pseudorandom index row*
|
||||||
|
//-----------------------------------------------
|
||||||
|
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||||
|
|
||||||
|
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||||
|
//-------------------------------------------
|
||||||
|
|
||||||
|
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||||
|
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||||
|
//update prev: it now points to the last row ever computed
|
||||||
|
prev = row;
|
||||||
|
|
||||||
|
//updates row: goes to the next row to be computed
|
||||||
|
//----------------------------------------------------
|
||||||
|
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||||
|
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||||
|
//----------------------------------------------------
|
||||||
|
|
||||||
|
} while (row != 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
//===================== Wrap-up Phase ===============================//
|
||||||
|
//Absorbs the last block of the memory matrix
|
||||||
|
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||||
|
//Squeezes the key
|
||||||
|
squeeze(state, K, (unsigned int) kLen);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// 2 way 256
|
||||||
|
// drop salt, salt len arguments, hard code some others.
|
||||||
|
// Data is interleaved 2x256.
|
||||||
|
|
||||||
|
int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
||||||
|
const void *pwd, const uint64_t pwdlen, const void *salt,
|
||||||
|
const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
|
||||||
|
const uint64_t nCols )
|
||||||
|
{
|
||||||
|
//====================== Basic variables ============================//
|
||||||
|
uint64_t _ALIGN(256) state[16];
|
||||||
|
int64_t row = 2; //index of row to be processed
|
||||||
|
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||||
|
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||||
|
int64_t tau; //Time Loop iterator
|
||||||
|
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||||
|
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||||
|
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||||
|
// int64_t i; //auxiliary iteration counter
|
||||||
|
int64_t v64; // 64bit var for memcpy
|
||||||
|
uint64_t instance0 = 0; // Seperate instance for each lane
|
||||||
|
uint64_t instance1 = 0;
|
||||||
|
//====================================================================/
|
||||||
|
|
||||||
|
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||||
|
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
|
||||||
|
|
||||||
|
uint64_t *ptrWord = wholeMatrix;
|
||||||
|
|
||||||
|
// 2 way 256 rewrite. Salt always == password, and data is interleaved,
|
||||||
|
// need to build in parallel:
|
||||||
|
// { password, (64 or 80 bytes)
|
||||||
|
// salt, (64 or 80 bytes) = same as password
|
||||||
|
// Klen, (u64) = 32 bytes
|
||||||
|
// pwdlen, (u64)
|
||||||
|
// saltlen, (u64)
|
||||||
|
// timecost, (u64)
|
||||||
|
// nrows, (u64)
|
||||||
|
// ncols, (u64)
|
||||||
|
// 0x80, (byte)
|
||||||
|
// { 0 .. 0 },
|
||||||
|
// 1 (byte)
|
||||||
|
// }
|
||||||
|
|
||||||
|
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||||
|
|
||||||
|
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||||
|
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||||
|
|
||||||
|
byte *ptrByte = (byte*) wholeMatrix;
|
||||||
|
|
||||||
|
//Prepends the password
|
||||||
|
memcpy(ptrByte, pwd, pwdlen);
|
||||||
|
ptrByte += pwdlen;
|
||||||
|
|
||||||
|
//Concatenates the salt
|
||||||
|
memcpy(ptrByte, salt, saltlen);
|
||||||
|
ptrByte += saltlen;
|
||||||
|
|
||||||
|
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||||
|
- (saltlen + pwdlen) );
|
||||||
|
|
||||||
|
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||||
|
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = pwdlen;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = saltlen;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = timeCost;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = nRows;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = nCols;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
|
||||||
|
//Now comes the padding
|
||||||
|
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||||
|
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||||
|
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||||
|
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||||
|
|
||||||
|
// from here on it's all SIMD access to state and matrix
|
||||||
|
// define vector pointers and adjust sizes and pointer offsets
|
||||||
|
|
||||||
|
ptrWord = wholeMatrix;
|
||||||
|
|
||||||
|
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||||
|
reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
|
||||||
|
|
||||||
|
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||||
|
nCols);
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
|
||||||
|
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||||
|
|
||||||
|
rowa = (rowa + step) & (window - 1);
|
||||||
|
|
||||||
|
prev = row;
|
||||||
|
row++;
|
||||||
|
|
||||||
|
if (rowa == 0)
|
||||||
|
{
|
||||||
|
step = window + gap; //changes the step: approximately doubles its value
|
||||||
|
window *= 2; //doubles the size of the re-visitation window
|
||||||
|
gap = -gap; //inverts the modifier to the step
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (row < nRows);
|
||||||
|
|
||||||
|
row = 0;
|
||||||
|
for (tau = 1; tau <= timeCost; tau++)
|
||||||
|
{
|
||||||
|
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
// This part is not parallel, rowa will be different for each lane.
|
||||||
|
// state (u64[16]) is interleaved 2x256, need to extract separately.
|
||||||
|
|
||||||
|
// index = 2 * instance / 4 * 4 + instance % 4
|
||||||
|
uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
|
||||||
|
+ ( instance0 & 0x3 )
|
||||||
|
uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
|
||||||
|
+ ( instance1 & 0x3 )
|
||||||
|
|
||||||
|
instance0 = state[ index0 ] & 0xf;
|
||||||
|
instance1 = (state+4)[ index1 ] & 0xf;
|
||||||
|
|
||||||
|
rowa0 = state[ instance0 ];
|
||||||
|
rowa1 = (state+4)[ instance1 ];
|
||||||
|
|
||||||
|
reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[rowa0*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[rowa1*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||||
|
/*
|
||||||
|
instance = state[instance & 0xF];
|
||||||
|
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
|
||||||
|
|
||||||
|
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||||
|
*/
|
||||||
|
// End of divergence.
|
||||||
|
|
||||||
|
prev = row;
|
||||||
|
row = (row + step) & (unsigned int)(nRows-1);
|
||||||
|
|
||||||
|
} while ( row != 0 );
|
||||||
|
}
|
||||||
|
|
||||||
|
absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
|
||||||
|
squeeze( state, K, (unsigned int) kLen );
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||||
|
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
|
||||||
|
const uint64_t timeCost, const uint64_t nRows,
|
||||||
|
const uint64_t nCols )
|
||||||
|
{
|
||||||
|
//========================== Basic variables ============================//
|
||||||
|
uint64_t _ALIGN(256) state[16];
|
||||||
|
int64_t row = 2; //index of row to be processed
|
||||||
|
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||||
|
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||||
|
int64_t tau; //Time Loop iterator
|
||||||
|
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||||
|
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||||
|
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||||
|
// int64_t i; //auxiliary iteration counter
|
||||||
|
//=======================================================================/
|
||||||
|
|
||||||
|
//======= Initializing the Memory Matrix and pointers to it =============//
|
||||||
|
//Tries to allocate enough space for the whole memory matrix
|
||||||
|
|
||||||
|
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||||
|
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||||
|
|
||||||
|
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||||
|
|
||||||
|
//==== Getting the password + salt + basil padded with 10*1 ============//
|
||||||
|
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||||
|
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||||
|
|
||||||
|
//First, we clean enough blocks for the password, salt, basil and padding
|
||||||
|
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
|
||||||
|
sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||||
|
byte *ptrByte = (byte*) wholeMatrix;
|
||||||
|
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
|
||||||
|
|
||||||
|
//Prepends the password
|
||||||
|
memcpy(ptrByte, pwd, pwdlen);
|
||||||
|
ptrByte += pwdlen;
|
||||||
|
|
||||||
|
//Concatenates the salt
|
||||||
|
memcpy(ptrByte, salt, saltlen);
|
||||||
|
ptrByte += saltlen;
|
||||||
|
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||||
|
memcpy(ptrByte, &kLen, sizeof (uint64_t));
|
||||||
|
ptrByte += sizeof (uint64_t);
|
||||||
|
memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
|
||||||
|
ptrByte += sizeof (uint64_t);
|
||||||
|
memcpy(ptrByte, &saltlen, sizeof (uint64_t));
|
||||||
|
ptrByte += sizeof (uint64_t);
|
||||||
|
memcpy(ptrByte, &timeCost, sizeof (uint64_t));
|
||||||
|
ptrByte += sizeof (uint64_t);
|
||||||
|
memcpy(ptrByte, &nRows, sizeof (uint64_t));
|
||||||
|
ptrByte += sizeof (uint64_t);
|
||||||
|
memcpy(ptrByte, &nCols, sizeof (uint64_t));
|
||||||
|
ptrByte += sizeof (uint64_t);
|
||||||
|
|
||||||
|
//Now comes the padding
|
||||||
|
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||||
|
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||||
|
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||||
|
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||||
|
|
||||||
|
//=================== Initializing the Sponge State ====================//
|
||||||
|
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||||
|
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
|
||||||
|
// if (state == NULL) {
|
||||||
|
// return -1;
|
||||||
|
// }
|
||||||
|
// initState( state );
|
||||||
|
|
||||||
|
//============================== Setup Phase =============================//
|
||||||
|
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||||
|
uint64_t *ptrWord = wholeMatrix;
|
||||||
|
|
||||||
|
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
|
||||||
|
BLOCK_LEN_BLAKE2_SAFE_INT64 );
|
||||||
|
/*
|
||||||
|
for ( i = 0; i < nBlocksInput; i++ )
|
||||||
|
{
|
||||||
|
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||||
|
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
//Initializes M[0] and M[1]
|
||||||
|
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
|
||||||
|
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
|
||||||
|
|
||||||
|
do {
|
||||||
|
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||||
|
reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
|
||||||
|
|
||||||
|
//updates the value of row* (deterministically picked during Setup))
|
||||||
|
rowa = (rowa + step) & (window - 1);
|
||||||
|
//update prev: it now points to the last row ever computed
|
||||||
|
prev = row;
|
||||||
|
//updates row: goes to the next row to be computed
|
||||||
|
row++;
|
||||||
|
|
||||||
|
//Checks if all rows in the window where visited.
|
||||||
|
if (rowa == 0) {
|
||||||
|
step = window + gap; //changes the step: approximately doubles its value
|
||||||
|
window *= 2; //doubles the size of the re-visitation window
|
||||||
|
gap = -gap; //inverts the modifier to the step
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (row < nRows);
|
||||||
|
|
||||||
|
//======================== Wandering Phase =============================//
|
||||||
|
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||||
|
for ( tau = 1; tau <= timeCost; tau++ )
|
||||||
|
{
|
||||||
|
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||||
|
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||||
|
do {
|
||||||
|
//Selects a pseudorandom index row*
|
||||||
|
//----------------------------------------------------------------------
|
||||||
|
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||||
|
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||||
|
//-----------------------------------------------------------------
|
||||||
|
|
||||||
|
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||||
|
reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
|
||||||
|
|
||||||
|
//update prev: it now points to the last row ever computed
|
||||||
|
prev = row;
|
||||||
|
|
||||||
|
//updates row: goes to the next row to be computed
|
||||||
|
//---------------------------------------------------------------
|
||||||
|
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||||
|
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
|
||||||
|
} while (row != 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
//========================= Wrap-up Phase ===============================//
|
||||||
|
//Absorbs the last block of the memory matrix
|
||||||
|
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||||
|
|
||||||
|
//Squeezes the key
|
||||||
|
squeeze( state, K, kLen );
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lyra2RE doesn't like the new wholeMatrix implementation
|
||||||
|
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
|
||||||
|
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
|
||||||
|
const uint64_t nRows, const uint64_t nCols )
|
||||||
|
{
|
||||||
|
//====================== Basic variables ============================//
|
||||||
|
uint64_t _ALIGN(256) state[16];
|
||||||
|
int64_t row = 2; //index of row to be processed
|
||||||
|
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||||
|
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||||
|
int64_t tau; //Time Loop iterator
|
||||||
|
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||||
|
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||||
|
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||||
|
int64_t i; //auxiliary iteration counter
|
||||||
|
int64_t v64; // 64bit var for memcpy
|
||||||
|
//====================================================================/
|
||||||
|
|
||||||
|
//=== Initializing the Memory Matrix and pointers to it =============//
|
||||||
|
//Tries to allocate enough space for the whole memory matrix
|
||||||
|
|
||||||
|
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||||
|
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||||
|
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||||
|
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
|
||||||
|
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
|
||||||
|
|
||||||
|
i = (int64_t)ROW_LEN_BYTES * nRows;
|
||||||
|
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
|
||||||
|
if (wholeMatrix == NULL)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
|
||||||
|
#elif defined(__SSE2__)
|
||||||
|
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
|
||||||
|
#else
|
||||||
|
memset( wholeMatrix, 0, i );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
uint64_t *ptrWord = wholeMatrix;
|
||||||
|
|
||||||
|
//=== Getting the password + salt + basil padded with 10*1 ==========//
|
||||||
|
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||||
|
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||||
|
|
||||||
|
//First, we clean enough blocks for the password, salt, basil and padding
|
||||||
|
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||||
|
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||||
|
|
||||||
|
byte *ptrByte = (byte*) wholeMatrix;
|
||||||
|
|
||||||
|
//Prepends the password
|
||||||
|
memcpy(ptrByte, pwd, pwdlen);
|
||||||
|
ptrByte += pwdlen;
|
||||||
|
|
||||||
|
//Concatenates the salt
|
||||||
|
memcpy(ptrByte, salt, saltlen);
|
||||||
|
ptrByte += saltlen;
|
||||||
|
|
||||||
|
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||||
|
// - (saltlen + pwdlen) );
|
||||||
|
|
||||||
|
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||||
|
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = pwdlen;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = saltlen;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = timeCost;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = nRows;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
v64 = nCols;
|
||||||
|
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||||
|
ptrByte += sizeof(uint64_t);
|
||||||
|
|
||||||
|
//Now comes the padding
|
||||||
|
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||||
|
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||||
|
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||||
|
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||||
|
|
||||||
|
//================= Initializing the Sponge State ====================//
|
||||||
|
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||||
|
|
||||||
|
// initState( state );
|
||||||
|
|
||||||
|
//========================= Setup Phase =============================//
|
||||||
|
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||||
|
|
||||||
|
ptrWord = wholeMatrix;
|
||||||
|
|
||||||
|
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||||
|
/*
|
||||||
|
for (i = 0; i < nBlocksInput; i++)
|
||||||
|
{
|
||||||
|
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||||
|
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
//Initializes M[0] and M[1]
|
||||||
|
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
|
||||||
|
|
||||||
|
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||||
|
nCols);
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||||
|
|
||||||
|
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||||
|
|
||||||
|
//updates the value of row* (deterministically picked during Setup))
|
||||||
|
rowa = (rowa + step) & (window - 1);
|
||||||
|
//update prev: it now points to the last row ever computed
|
||||||
|
|
||||||
|
prev = row;
|
||||||
|
//updates row: goes to the next row to be computed
|
||||||
|
row++;
|
||||||
|
|
||||||
|
//Checks if all rows in the window where visited.
|
||||||
|
if (rowa == 0)
|
||||||
|
{
|
||||||
|
step = window + gap; //changes the step: approximately doubles its value
|
||||||
|
window *= 2; //doubles the size of the re-visitation window
|
||||||
|
gap = -gap; //inverts the modifier to the step
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (row < nRows);
|
||||||
|
|
||||||
|
//===================== Wandering Phase =============================//
|
||||||
|
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||||
|
for (tau = 1; tau <= timeCost; tau++)
|
||||||
|
{
|
||||||
|
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||||
|
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
//Selects a pseudorandom index row*
|
||||||
|
//-----------------------------------------------
|
||||||
|
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||||
|
|
||||||
|
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||||
|
//-------------------------------------------
|
||||||
|
|
||||||
|
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||||
|
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||||
|
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||||
|
//update prev: it now points to the last row ever computed
|
||||||
|
prev = row;
|
||||||
|
|
||||||
|
//updates row: goes to the next row to be computed
|
||||||
|
//----------------------------------------------------
|
||||||
|
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||||
|
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||||
|
//----------------------------------------------------
|
||||||
|
|
||||||
|
} while (row != 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
//===================== Wrap-up Phase ===============================//
|
||||||
|
//Absorbs the last block of the memory matrix
|
||||||
|
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||||
|
//Squeezes the key
|
||||||
|
squeeze(state, K, (unsigned int) kLen);
|
||||||
|
|
||||||
|
//================== Freeing the memory =============================//
|
||||||
|
_mm_free(wholeMatrix);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
319
algo/lyra2/sponge-2way.c
Normal file
319
algo/lyra2/sponge-2way.c
Normal file
@@ -0,0 +1,319 @@
|
|||||||
|
/**
|
||||||
|
* A simple implementation of Blake2b's internal permutation
|
||||||
|
* in the form of a sponge.
|
||||||
|
*
|
||||||
|
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||||
|
*
|
||||||
|
* This software is hereby placed in the public domain.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
|
||||||
|
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||||
|
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||||
|
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||||
|
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||||
|
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "algo-gate.h"
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include "sponge.h"
|
||||||
|
#include "lyra2.h"
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
/**
 * Squeeze output from the 2-way sponge state into Out.
 *
 * State  sponge state, accessed as an array of __m512i
 * Out    destination buffer, written as __m512i
 * len    requested length; it is converted to a count of 32-byte units
 *        (len / 32) before use
 *
 * Full blocks are copied out with one LYRA_ROUND_2WAY_AVX512 applied to the
 * state after each; whatever is left over is copied without a further round.
 */
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{
    const int len_m256i = len / 32;
    const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
    __m512i* state = (__m512i*)State;
    __m512i* out   = (__m512i*)Out;
    int i;

    // NOTE(review): the "* 2" factors below look like __m256i counts applied
    // to __m512i pointers; confirm the unit memcpy_512 expects and the
    // intended output stride.

    //Squeezes full blocks
    for ( i = 0; i < fullBlocks; i++ )
    {
       memcpy_512( out, state, BLOCK_LEN_M256I*2 );
       // Use the AVX512 2-way round, the same macro every other function
       // in this file applies to the state.
       LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
       out += BLOCK_LEN_M256I*2;
    }
    //Squeezes remaining bytes
    memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
}
|
||||||
|
|
||||||
|
/**
 * Absorb one input block into the 2-way sponge state.
 *
 * State  sponge state, loaded and stored as four aligned __m512i vectors
 * In     input block; its first three __m512i vectors are XORed into the
 *        first three state vectors
 *
 * After the XOR, LYRA_12_ROUNDS_2WAY_AVX512 is applied and the result is
 * written back to State.
 */
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
{
    __m512i *sponge = (__m512i*)State;
    const __m512i *block = (const __m512i*)In;

    // Only three of the four state vectors receive input.
    __m512i s0 = _mm512_xor_si512( _mm512_load_si512( sponge     ), block[0] );
    __m512i s1 = _mm512_xor_si512( _mm512_load_si512( sponge + 1 ), block[1] );
    __m512i s2 = _mm512_xor_si512( _mm512_load_si512( sponge + 2 ), block[2] );
    __m512i s3 = _mm512_load_si512( sponge + 3 );

    LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 );

    _mm512_store_si512( sponge,     s0 );
    _mm512_store_si512( sponge + 1, s1 );
    _mm512_store_si512( sponge + 2, s2 );
    _mm512_store_si512( sponge + 3, s3 );
}
|
||||||
|
|
||||||
|
inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
|
||||||
|
const uint64_t nBlocks, const uint64_t block_len )
|
||||||
|
{
|
||||||
|
register __m512i state0, state1, state2, state3;
|
||||||
|
|
||||||
|
state0 =
|
||||||
|
state1 = m512_zero;
|
||||||
|
state2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||||
|
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||||
|
state3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||||
|
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||||
|
|
||||||
|
for ( int i = 0; i < nBlocks; i++ )
|
||||||
|
{
|
||||||
|
__m512i *in = (__m512i*)In;
|
||||||
|
state0 = _mm512_xor_si512( state0, in[0] );
|
||||||
|
state1 = _mm512_xor_si512( state1, in[1] );
|
||||||
|
|
||||||
|
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
|
In += block_len * 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm512_store_si512( (__m512i*)State, state0 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
|
||||||
|
uint64_t nCols )
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
//M[row][C-1-col] = H.reduced_squeeze()
|
||||||
|
|
||||||
|
|
||||||
|
register __m512i state0, state1, state2, state3;
|
||||||
|
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
|
||||||
|
|
||||||
|
state0 = _mm512_load_si512( (__m512i*)State );
|
||||||
|
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||||
|
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||||
|
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||||
|
|
||||||
|
for ( i = 0; i < 9; i += 3)
|
||||||
|
{
|
||||||
|
_mm_prefetch( out - i, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( i = 0; i < nCols; i++ )
|
||||||
|
{
|
||||||
|
_mm_prefetch( out - 9, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( out - 11, _MM_HINT_T0 );
|
||||||
|
|
||||||
|
out[0] = state0;
|
||||||
|
out[1] = state1;
|
||||||
|
out[2] = state2;
|
||||||
|
|
||||||
|
//Goes to next block (column) that will receive the squeezed data
|
||||||
|
out -= BLOCK_LEN_M256I * 2;
|
||||||
|
|
||||||
|
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm512_store_si512( (__m512i*)State, state0 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function has to deal with gathering 2 256 bit rowin vectors from
|
||||||
|
// non-contiguous memory. Extra work and performance penalty.
|
||||||
|
|
||||||
|
inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
|
||||||
|
uint64_t *rowOut, uint64_t nCols )
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
register __m512i state0, state1, state2, state3;
|
||||||
|
__m512i *in = (__m256i*)rowIn;
|
||||||
|
|
||||||
|
state0 = _mm512_load_si512( (__m512i*)State );
|
||||||
|
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||||
|
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||||
|
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||||
|
|
||||||
|
for ( i = 0; i < nCols; i++ )
|
||||||
|
{
|
||||||
|
state0 = _mm512_xor_si512( state0, in[0] );
|
||||||
|
state1 = _mm512_xor_si512( state1, in[1] );
|
||||||
|
state2 = _mm512_xor_si512( state2, in[2] );
|
||||||
|
|
||||||
|
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
|
|
||||||
|
out[0] = _mm512_xor_si512( state0, in[0] );
|
||||||
|
out[1] = _mm512_xor_si512( state1, in[1] );
|
||||||
|
out[2] = _mm512_xor_si512( state2, in[2] );
|
||||||
|
|
||||||
|
//Input: next column (i.e., next block in sequence)
|
||||||
|
in0 += BLOCK_LEN_M256I;
|
||||||
|
in1 += BLOCK_LEN_M256I;
|
||||||
|
//Output: goes to previous column
|
||||||
|
out -= BLOCK_LEN_M256I * 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm512_store_si256( (__m512i*)State, state0 );
|
||||||
|
_mm512_store_si256( (__m512i*)State + 1, state1 );
|
||||||
|
_mm512_store_si256( (__m512i*)State + 2, state2 );
|
||||||
|
_mm512_store_si256( (__m512i*)State + 3, state3 );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||||
|
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
register __m512i state0, state1, state2, state3;
|
||||||
|
__m512i* in = (__m512i*)rowIn;
|
||||||
|
__m512i* inout = (__m512i*)rowInOut;
|
||||||
|
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
|
||||||
|
__m512i t0, t1, t2;
|
||||||
|
|
||||||
|
state0 = _mm512_load_si512( (__m512i*)State );
|
||||||
|
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||||
|
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||||
|
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||||
|
|
||||||
|
for ( i = 0; i < nCols; i++ )
|
||||||
|
{
|
||||||
|
state0 = _mm512_xor_si512( state0,
|
||||||
|
_mm512_add_epi64( in[0], inout[0] ) );
|
||||||
|
state1 = _mm512_xor_si512( state1,
|
||||||
|
_mm512_add_epi64( in[1], inout[1] ) );
|
||||||
|
state2 = _mm512_xor_si512( state2,
|
||||||
|
_mm512_add_epi64( in[2], inout[2] ) );
|
||||||
|
|
||||||
|
LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
|
||||||
|
|
||||||
|
out[0] = _mm512_xor_si512( state0, in[0] );
|
||||||
|
out[1] = _mm512_xor_si512( state1, in[1] );
|
||||||
|
out[2] = _mm512_xor_si512( state2, in[2] );
|
||||||
|
|
||||||
|
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||||
|
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||||
|
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||||
|
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||||
|
|
||||||
|
inout[0] = _mm512_xor_si512( inout[0],
|
||||||
|
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
||||||
|
inout[1] = _mm512_xor_si512( inout[1],
|
||||||
|
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
||||||
|
inout[2] = _mm512_xor_si512( inout[2],
|
||||||
|
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
||||||
|
|
||||||
|
//Inputs: next column (i.e., next block in sequence)
|
||||||
|
in += BLOCK_LEN_M256I * 2;
|
||||||
|
inout += BLOCK_LEN_M256I * 2;
|
||||||
|
//Output: goes to previous column
|
||||||
|
out -= BLOCK_LEN_M256I * 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm512_store_si512( (__m512i*)State, state0 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
|
||||||
|
uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
|
||||||
|
uint64_t nCols )
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
register __m512i state0, state1, state2, state3;
|
||||||
|
__m256i *in0 = (__m256i*)rowIn0;
|
||||||
|
__m256i *in0 = (__m256i*)rowIn0;
|
||||||
|
__m2512* in = (__m512i*)rowIn;
|
||||||
|
__m2512* inout = (__m512i*)rowInOut;
|
||||||
|
__m512i* out = (__m512i*)rowOut;
|
||||||
|
__m512i t0, t1, t2;
|
||||||
|
|
||||||
|
_mm_prefetch( in0, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in1, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in0 + 2, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in1 + 2, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in0 + 4, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in1 + 4, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in0 + 6, _MM_HINT_T0 );
|
||||||
|
_mm_prefetch( in1 + 6, _MM_HINT_T0 );
|
||||||
|
|
||||||
|
state0 = _mm512_load_si512( (__m512i*)State );
|
||||||
|
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||||
|
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||||
|
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||||
|
|
||||||
|
//Absorbing "M[prev] [+] M[row*]"
|
||||||
|
|
||||||
|
// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
|
||||||
|
// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
|
||||||
|
// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
|
||||||
|
t0 = mm512_concat_256( in1[0], in0[0] );
|
||||||
|
t1 = mm512_concat_256( in1[1], in0[1] );
|
||||||
|
t2 = mm512_concat_256( in1[2], in0[2] );
|
||||||
|
|
||||||
|
state0 = _mm512_xor_si512( state0,
|
||||||
|
_mm512_add_epi64( t0, inout[0] ) );
|
||||||
|
state1 = _mm512_xor_si512( state1,
|
||||||
|
_mm512_add_epi64( t1, inout[1] ) );
|
||||||
|
state2 = _mm512_xor_si512( state2,
|
||||||
|
_mm512_add_epi64( t2, inout[2] ) );
|
||||||
|
|
||||||
|
//Applies the reduced-round transformation f to the sponge's state
|
||||||
|
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||||
|
|
||||||
|
//M[rowOut][col] = M[rowOut][col] XOR rand
|
||||||
|
out[0] = _mm512_xor_si512( out[0], state0 );
|
||||||
|
out[1] = _mm512_xor_si512( out[1], state1 );
|
||||||
|
out[2] = _mm512_xor_si512( out[2], state2 );
|
||||||
|
|
||||||
|
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||||
|
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||||
|
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||||
|
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||||
|
|
||||||
|
inout[0] = _mm512_xor_si512( inout[0],
|
||||||
|
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
||||||
|
inout[1] = _mm512_xor_si512( inout[1],
|
||||||
|
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
||||||
|
inout[2] = _mm512_xor_si512( inout[2],
|
||||||
|
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
||||||
|
|
||||||
|
//Goes to next block
|
||||||
|
in += BLOCK_LEN_M256I * 2;
|
||||||
|
out += BLOCK_LEN_M256I * 2;
|
||||||
|
inout += BLOCK_LEN_M256I * 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm512_store_si512( (__m512i*)State, state0 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||||
|
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // AVX512
|
@@ -52,8 +52,46 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
|||||||
// However, 2 way parallel looks trivial to code for AVX512 except for
|
// However, 2 way parallel looks trivial to code for AVX512 except for
|
||||||
// a data dependency with rowa.
|
// a data dependency with rowa.
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
// Mixing step on four 512-bit vectors of packed 64-bit words: alternating
// add and xor, with the xor results passed through mm512_ror_64 using the
// constants 32, 24, 16 and 63 (the rotation schedule of the BLAKE2b G
// function; no message words are injected here).
// NOTE(review): mm512_ror_64 is a project helper, presumably a per-element
// rotate right - confirm against the simd utility headers.
// Every argument is both read and assigned, and appears several times in
// the expansion, so pass plain side-effect-free lvalues. The expansion is a
// bare statement sequence that already ends in ';'.
#define G2W_4X64( va, vb, vc, vd ) \
    va = _mm512_add_epi64( va, vb ); \
    vd = mm512_ror_64( _mm512_xor_si512( vd, va ), 32 ); \
    vc = _mm512_add_epi64( vc, vd ); \
    vb = mm512_ror_64( _mm512_xor_si512( vb, vc ), 24 ); \
    va = _mm512_add_epi64( va, vb ); \
    vd = mm512_ror_64( _mm512_xor_si512( vd, va ), 16 ); \
    vc = _mm512_add_epi64( vc, vd ); \
    vb = mm512_ror_64( _mm512_xor_si512( vb, vc ), 63 );
|
||||||
|
|
||||||
|
// One round of the reduced permutation over a state held in four __m512i
// rows (s0..s3). All four arguments are updated in place.
//
// Structure: mix, permute rows s1..s3, mix again, then apply the inverse
// permutation so the rows return to their original arrangement.
//
// Two fixes relative to the previous text of this macro:
//  - the mixing step is G2W_4X64, the 512-bit macro defined just above;
//    the former G_4X64 name is not the one this AVX512 section defines.
//  - the first s1 permutation is mm512_ror1x64_256, so that it is exactly
//    undone by the mm512_rol1x64_256 in the second half (s3 already used
//    the matching rol/ror pair).
// NOTE(review): the mm512_*_256 helpers are assumed to operate within each
// 256-bit half independently - confirm against the simd utility headers.
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
    G2W_4X64( s0, s1, s2, s3 ); \
    s1 = mm512_ror1x64_256( s1 ); \
    s2 = mm512_swap128_256( s2 ); \
    s3 = mm512_rol1x64_256( s3 ); \
    G2W_4X64( s0, s1, s2, s3 ); \
    s1 = mm512_rol1x64_256( s1 ); \
    s2 = mm512_swap128_256( s2 ); \
    s3 = mm512_ror1x64_256( s3 );
|
||||||
|
|
||||||
|
// Twelve back-to-back expansions of LYRA_ROUND_2WAY_AVX512 on the same four
// rows. No separators are written between the invocations because the round
// macro's expansion already ends with ';'.
#define LYRA_12_ROUNDS_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 ) \
    LYRA_ROUND_2WAY_AVX512( r0, r1, r2, r3 )
|
||||||
|
|
||||||
|
|
||||||
|
#endif // AVX512
|
||||||
|
|
||||||
#if defined __AVX2__
|
#if defined __AVX2__
|
||||||
// only available with avx2
|
|
||||||
|
|
||||||
// process 4 columns in parallel
|
// process 4 columns in parallel
|
||||||
// returns void, updates all args
|
// returns void, updates all args
|
||||||
@@ -89,9 +127,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
|||||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
LYRA_ROUND_AVX2( s0, s1, s2, s3 )
|
||||||
|
|
||||||
#elif defined(__SSE2__)
|
#endif
|
||||||
|
|
||||||
|
#if defined(__SSE2__)
|
||||||
|
|
||||||
// process 2 columns in parallel
|
// process 2 columns in parallel
|
||||||
// returns void, all args updated
|
// returns void, all args updated
|
||||||
@@ -129,7 +169,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
|||||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)
|
||||||
|
|
||||||
|
|
||||||
#endif // AVX2 else SSE2
|
#endif // AVX2 else SSE2
|
||||||
@@ -161,6 +201,30 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
|||||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
|
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
//---- Housekeeping
// NOTE(review): the 16 size hint is the same one the scalar initState()
// prototype carries; confirm it still matches the 2-way state layout.
void initState_2way( uint64_t state[/*16*/] );

//---- Squeezes
void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row,
                              uint64_t nCols );

//---- Absorbs
void absorbBlock_2way( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
                                 const uint64_t nBlocks,
                                 const uint64_t block_len );

//---- Duplexes
void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
                             uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
                                 uint64_t *rowInOut, uint64_t *rowOut,
                                 uint64_t nCols );
// Unlike the scalar reducedDuplexRow(), this variant takes two input rows
// (rowIn1 and rowIn0) in addition to rowInOut and rowOut.
void reducedDuplexRow_2way( uint64_t *state, uint64_t *rowIn1,
                            uint64_t *rowIn0, uint64_t *rowInOut,
                            uint64_t *rowOut, uint64_t nCols );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
//---- Housekeeping
|
//---- Housekeeping
|
||||||
void initState(uint64_t state[/*16*/]);
|
void initState(uint64_t state[/*16*/]);
|
||||||
|
|
||||||
@@ -178,20 +242,4 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint6
|
|||||||
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
||||||
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
||||||
|
|
||||||
//---- Misc
|
|
||||||
void printArray(unsigned char *array, unsigned int size, char *name);
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
|
|
||||||
////TESTS////
|
|
||||||
//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
|
||||||
//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
|
||||||
//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
|
|
||||||
//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
|
||||||
//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
|
||||||
//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
|
||||||
/////////////
|
|
||||||
|
|
||||||
|
|
||||||
#endif /* SPONGE_H_ */
|
#endif /* SPONGE_H_ */
|
||||||
|
@@ -5,7 +5,7 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX2__) && defined(__AES__)
|
||||||
// #define HMQ1725_4WAY
|
// #define HMQ1725_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_hmq1725_algo( algo_gate_t* gate );
|
bool register_hmq1725_algo( algo_gate_t* gate );
|
||||||
|
@@ -4,7 +4,8 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
@@ -13,73 +14,70 @@
|
|||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
luffa_4way_context luffa;
|
luffa_4way_context luffa;
|
||||||
cubehashParam cube;
|
cube_4way_context cube;
|
||||||
sph_shavite512_context shavite;
|
sph_shavite512_context shavite;
|
||||||
simd_4way_context simd;
|
simd_4way_context simd;
|
||||||
hashState_echo echo;
|
simd_2way_context simd2;
|
||||||
|
hashState_echo echo;
|
||||||
} qubit_4way_ctx_holder;
|
} qubit_4way_ctx_holder;
|
||||||
|
|
||||||
qubit_4way_ctx_holder qubit_4way_ctx;
|
qubit_4way_ctx_holder qubit_4way_ctx;
|
||||||
|
|
||||||
void init_qubit_4way_ctx()
|
void init_qubit_4way_ctx()
|
||||||
{
|
{
|
||||||
cubehashInit(&qubit_4way_ctx.cube,512,16,32);
|
cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 );
|
||||||
sph_shavite512_init(&qubit_4way_ctx.shavite);
|
sph_shavite512_init(&qubit_4way_ctx.shavite);
|
||||||
simd_4way_init( &qubit_4way_ctx.simd, 512 );
|
simd_4way_init( &qubit_4way_ctx.simd, 512 );
|
||||||
init_echo(&qubit_4way_ctx.echo, 512);
|
simd_2way_init( &qubit_4way_ctx.simd2, 512 );
|
||||||
|
init_echo(&qubit_4way_ctx.echo, 512);
|
||||||
};
|
};
|
||||||
|
|
||||||
void qubit_4way_hash( void *output, const void *input )
|
void qubit_4way_hash( void *output, const void *input )
|
||||||
{
|
{
|
||||||
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
uint32_t vhash[16*4] __attribute__ ((aligned (128)));
|
||||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
uint32_t hash0[16] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
uint32_t hash1[16] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
uint32_t hash2[16] __attribute__ ((aligned (64)));
|
||||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
uint32_t hash3[16] __attribute__ ((aligned (64)));
|
||||||
qubit_4way_ctx_holder ctx;
|
qubit_4way_ctx_holder ctx;
|
||||||
|
|
||||||
memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
|
memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
|
||||||
|
|
||||||
luffa_4way_update( &ctx.luffa, input + (64<<2), 16 );
|
luffa_4way_update( &ctx.luffa, input + (64<<2), 16 );
|
||||||
luffa_4way_close( &ctx.luffa, vhash );
|
luffa_4way_close( &ctx.luffa, vhash );
|
||||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
|
||||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
|
||||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
|
||||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
|
||||||
|
|
||||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
|
||||||
sizeof(sph_shavite512_context) );
|
sizeof(sph_shavite512_context) );
|
||||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
|
||||||
sizeof(sph_shavite512_context) );
|
sizeof(sph_shavite512_context) );
|
||||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
|
||||||
sizeof(sph_shavite512_context) );
|
sizeof(sph_shavite512_context) );
|
||||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
|
||||||
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
(const BitSequence *) hash0, 512 );
|
(const BitSequence *) hash0, 512 );
|
||||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
(const BitSequence *) hash1, 512 );
|
(const BitSequence *) hash1, 512 );
|
||||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
(const BitSequence *) hash2, 512 );
|
(const BitSequence *) hash2, 512 );
|
||||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
(const BitSequence *) hash3, 512 );
|
(const BitSequence *) hash3, 512 );
|
||||||
|
|
||||||
@@ -92,71 +90,40 @@ void qubit_4way_hash( void *output, const void *input )
|
|||||||
int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
|
int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr )
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
{
|
{
|
||||||
uint32_t hash[4*16] __attribute__ ((aligned (128)));
|
uint32_t hash[4*8] __attribute__ ((aligned (128)));
|
||||||
uint32_t vdata[4*24] __attribute__ ((aligned (64)));
|
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
uint32_t n = pdata[19];
|
uint32_t n = pdata[19];
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
|
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
|
||||||
int thr_id = mythr->id;
|
int thr_id = mythr->id;
|
||||||
const uint32_t Htarg = ptarget[7];
|
const uint32_t Htarg = ptarget[7];
|
||||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
|
||||||
0xFFF, 0xFFFF, 0x10000000 };
|
|
||||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
|
||||||
0xFFFFF000, 0xFFFF0000, 0 };
|
|
||||||
|
|
||||||
casti_m512i( endiandata, 0 ) = mm512_bswap_32( casti_m512i( pdata, 0 ) );
|
|
||||||
casti_m512i( endiandata, 1 ) = mm512_bswap_32( casti_m512i( pdata, 1 ) );
|
|
||||||
casti_m512i( endiandata, 4 ) = mm512_bswap_32( casti_m512i( pdata, 4 ) );
|
|
||||||
|
|
||||||
uint64_t *edata = (uint64_t*)endiandata;
|
|
||||||
intrlv_4x128( (uint64_t*)vdata, edata, edata, 640 );
|
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_4x128( vdata, pdata );
|
||||||
luffa_4way_init( &qubit_4way_ctx.luffa, 512 );
|
luffa_4way_init( &qubit_4way_ctx.luffa, 512 );
|
||||||
luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );
|
luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );
|
||||||
|
|
||||||
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
do
|
||||||
{
|
{
|
||||||
uint32_t mask = masks[m];
|
be32enc( noncep, n );
|
||||||
do
|
be32enc( noncep+ 4, n+1 );
|
||||||
{
|
be32enc( noncep+ 8, n+2 );
|
||||||
be32enc( noncep, n );
|
be32enc( noncep+12, n+3 );
|
||||||
be32enc( noncep+4, n+1 );
|
|
||||||
be32enc( noncep+8, n+2 );
|
|
||||||
be32enc( noncep+12, n+3 );
|
|
||||||
qubit_4way_hash( hash, vdata );
|
|
||||||
pdata[19] = n;
|
|
||||||
|
|
||||||
if ( !( hash[7] & mask ) )
|
qubit_4way_hash( hash, vdata );
|
||||||
if ( fulltest( hash, ptarget) && !opt_benchmark )
|
pdata[19] = n;
|
||||||
{
|
|
||||||
pdata[19] = n;
|
for ( int lane = 0; lane < 4; lane++ )
|
||||||
submit_lane_solution( work, hash, mythr, 0 );
|
if ( ( hash+(lane<<3) )[7] < Htarg )
|
||||||
}
|
if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
|
||||||
if ( !( (hash+8)[7] & mask ) )
|
{
|
||||||
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
|
pdata[19] = n + lane;
|
||||||
{
|
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
|
||||||
pdata[19] = n+1;
|
}
|
||||||
submit_lane_solution( work, hash+8, mythr, 1 );
|
n += 4;
|
||||||
}
|
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
|
||||||
if ( !( hash+16[7] & mask ) )
|
|
||||||
if ( fulltest( hash, ptarget) && !opt_benchmark )
|
|
||||||
{
|
|
||||||
pdata[19] = n+2;
|
|
||||||
submit_lane_solution( work, hash, mythr, 2 );
|
|
||||||
}
|
|
||||||
if ( !( (hash+24)[7] & mask ) )
|
|
||||||
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
|
|
||||||
{
|
|
||||||
pdata[19] = n+3;
|
|
||||||
submit_lane_solution( work, hash+8, mythr, 3 );
|
|
||||||
}
|
|
||||||
n += 4;
|
|
||||||
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
*hashes_done = n - first_nonce;
|
*hashes_done = n - first_nonce;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@@ -2,14 +2,12 @@
|
|||||||
|
|
||||||
bool register_qubit_algo( algo_gate_t* gate )
|
bool register_qubit_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
/*
|
|
||||||
#if defined (QUBIT_4WAY)
|
#if defined (QUBIT_4WAY)
|
||||||
init_qubit_2way_ctx();
|
init_qubit_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_qubit_4way;
|
gate->scanhash = (void*)&scanhash_qubit_4way;
|
||||||
gate->hash = (void*)&qubit_4way_hash;
|
gate->hash = (void*)&qubit_4way_hash;
|
||||||
#elif defined (QUBIT_4WAY)
|
#elif defined (QUBIT_2WAY)
|
||||||
*/
|
|
||||||
#if defined (QUBIT_2WAY)
|
|
||||||
init_qubit_2way_ctx();
|
init_qubit_2way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_qubit_2way;
|
gate->scanhash = (void*)&scanhash_qubit_2way;
|
||||||
gate->hash = (void*)&qubit_2way_hash;
|
gate->hash = (void*)&qubit_2way_hash;
|
||||||
@@ -18,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_qubit;
|
gate->scanhash = (void*)&scanhash_qubit;
|
||||||
gate->hash = (void*)&qubit_hash;
|
gate->hash = (void*)&qubit_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,17 +4,15 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
/*
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define QUBIT_2WAY 1
|
#define QUBIT_4WAY 1
|
||||||
#elif defined(__AVX2__) && defined(__AES__)
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
*/
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
|
||||||
#define QUBIT_2WAY 1
|
#define QUBIT_2WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_qubit_algo( algo_gate_t* gate );
|
bool register_qubit_algo( algo_gate_t* gate );
|
||||||
/*
|
|
||||||
#if defined(QUBIT_4WAY)
|
#if defined(QUBIT_4WAY)
|
||||||
|
|
||||||
void qubit_4way_hash( void *state, const void *input );
|
void qubit_4way_hash( void *state, const void *input );
|
||||||
@@ -23,8 +21,6 @@ int scanhash_qubit_4way( struct work *work, uint32_t max_nonce,
|
|||||||
void init_qubit_4way_ctx();
|
void init_qubit_4way_ctx();
|
||||||
|
|
||||||
#elif defined(QUBIT_2WAY)
|
#elif defined(QUBIT_2WAY)
|
||||||
*/
|
|
||||||
#if defined(QUBIT_2WAY)
|
|
||||||
|
|
||||||
void qubit_2way_hash( void *state, const void *input );
|
void qubit_2way_hash( void *state, const void *input );
|
||||||
int scanhash_qubit_2way( struct work *work, uint32_t max_nonce,
|
int scanhash_qubit_2way( struct work *work, uint32_t max_nonce,
|
||||||
|
@@ -735,7 +735,7 @@ do { \
|
|||||||
fft128_4way( a+512 );
|
fft128_4way( a+512 );
|
||||||
}
|
}
|
||||||
|
|
||||||
#define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
#define c1_16_512( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||||
|
|
||||||
void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
||||||
{
|
{
|
||||||
@@ -744,8 +744,12 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
|||||||
__m512i *S = (__m512i*) state;
|
__m512i *S = (__m512i*) state;
|
||||||
__m512i *M = (__m512i*) msg;
|
__m512i *M = (__m512i*) msg;
|
||||||
__m512i *W = (__m512i*) fft;
|
__m512i *W = (__m512i*) fft;
|
||||||
static const m512_v16 code[] = { c1_16(185), c1_16(233),
|
|
||||||
c1_16(185), c1_16(233) };
|
static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
|
||||||
|
|
||||||
|
|
||||||
|
// static const m512_v16 code[] = { c1_16(185), c1_16(233),
|
||||||
|
// c1_16(185), c1_16(233) };
|
||||||
|
|
||||||
|
|
||||||
S0l = _mm512_xor_si512( S[0], M[0] );
|
S0l = _mm512_xor_si512( S[0], M[0] );
|
||||||
@@ -999,7 +1003,9 @@ void SIMD_4way_Compress( simd_4way_context *state, const void *m, int final )
|
|||||||
{
|
{
|
||||||
m512_v16 Y[32];
|
m512_v16 Y[32];
|
||||||
uint16_t *y = (uint16_t*) Y[0].u16;
|
uint16_t *y = (uint16_t*) Y[0].u16;
|
||||||
|
|
||||||
fft256_4way_msg( y, m, final );
|
fft256_4way_msg( y, m, final );
|
||||||
|
|
||||||
rounds512_4way( state->A, m, y );
|
rounds512_4way( state->A, m, y );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1340,7 +1346,8 @@ do { \
|
|||||||
DO_REDUCE_FULL_S( 6 );
|
DO_REDUCE_FULL_S( 6 );
|
||||||
DO_REDUCE_FULL_S( 7 );
|
DO_REDUCE_FULL_S( 7 );
|
||||||
|
|
||||||
#undef BUTTERFLY
|
#undef BUTTERFLY_0
|
||||||
|
#undef BUTTERFLY_N
|
||||||
#undef DO_REDUCE
|
#undef DO_REDUCE
|
||||||
|
|
||||||
A[0] = X0;
|
A[0] = X0;
|
||||||
@@ -1491,6 +1498,7 @@ do { \
|
|||||||
|
|
||||||
fft128_2way( a );
|
fft128_2way( a );
|
||||||
fft128_2way( a+256 );
|
fft128_2way( a+256 );
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
#define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||||
@@ -1751,7 +1759,9 @@ void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
|
|||||||
{
|
{
|
||||||
m256_v16 Y[32];
|
m256_v16 Y[32];
|
||||||
uint16_t *y = (uint16_t*) Y[0].u16;
|
uint16_t *y = (uint16_t*) Y[0].u16;
|
||||||
|
|
||||||
fft256_2way_msg( y, m, final );
|
fft256_2way_msg( y, m, final );
|
||||||
|
|
||||||
rounds512_2way( state->A, m, y );
|
rounds512_2way( state->A, m, y );
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1864,6 +1874,7 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval,
|
|||||||
{
|
{
|
||||||
// We can hash the data directly from the input buffer.
|
// We can hash the data directly from the input buffer.
|
||||||
SIMD_2way_Compress( state, data, 0 );
|
SIMD_2way_Compress( state, data, 0 );
|
||||||
|
|
||||||
databitlen -= bs;
|
databitlen -= bs;
|
||||||
data += 2*( bs/8 );
|
data += 2*( bs/8 );
|
||||||
state->count += bs;
|
state->count += bs;
|
||||||
@@ -1874,7 +1885,8 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval,
|
|||||||
int len = bs - current;
|
int len = bs - current;
|
||||||
if ( databitlen < len )
|
if ( databitlen < len )
|
||||||
{
|
{
|
||||||
memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
|
|
||||||
|
memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
|
||||||
state->count += databitlen;
|
state->count += databitlen;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@@ -1,11 +1,7 @@
|
|||||||
#include "cpuminer-config.h"
|
#include "cpuminer-config.h"
|
||||||
#include "c11-gate.h"
|
#include "c11-gate.h"
|
||||||
|
|
||||||
#if defined (C11_4WAY)
|
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "algo/blake/blake-hash-4way.h"
|
#include "algo/blake/blake-hash-4way.h"
|
||||||
#include "algo/bmw/bmw-hash-4way.h"
|
#include "algo/bmw/bmw-hash-4way.h"
|
||||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||||
@@ -13,11 +9,237 @@
|
|||||||
#include "algo/jh/jh-hash-4way.h"
|
#include "algo/jh/jh-hash-4way.h"
|
||||||
#include "algo/keccak/keccak-hash-4way.h"
|
#include "algo/keccak/keccak-hash-4way.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
|
|
||||||
|
#if defined (C11_8WAY)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
} c11_8way_ctx_holder;
|
||||||
|
|
||||||
|
c11_8way_ctx_holder c11_8way_ctx;
|
||||||
|
|
||||||
|
void init_c11_8way_ctx()
|
||||||
|
{
|
||||||
|
blake512_8way_init( &c11_8way_ctx.blake );
|
||||||
|
bmw512_8way_init( &c11_8way_ctx.bmw );
|
||||||
|
init_groestl( &c11_8way_ctx.groestl, 64 );
|
||||||
|
skein512_8way_init( &c11_8way_ctx.skein );
|
||||||
|
jh512_8way_init( &c11_8way_ctx.jh );
|
||||||
|
keccak512_8way_init( &c11_8way_ctx.keccak );
|
||||||
|
luffa_4way_init( &c11_8way_ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_shavite512_init( &c11_8way_ctx.shavite );
|
||||||
|
simd_4way_init( &c11_8way_ctx.simd, 512 );
|
||||||
|
init_echo( &c11_8way_ctx.echo, 512 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void c11_8way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
c11_8way_ctx_holder ctx;
|
||||||
|
memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );
|
||||||
|
|
||||||
|
// 1 Blake 4way
|
||||||
|
blake512_8way_update( &ctx.blake, input, 80 );
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
|
// 2 Bmw
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
|
||||||
|
// Serial
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// 3 Groestl
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
|
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
|
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
|
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
|
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||||
|
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||||
|
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||||
|
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||||
|
|
||||||
|
// 4way
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
|
||||||
|
// 4 JH
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
|
// 5 Keccak
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
|
// 6 Skein
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
// Serial
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// 7 Luffa + 8 cube
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
// 9 Shavite
|
||||||
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
|
||||||
|
// 10 Simd
|
||||||
|
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||||
|
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );
|
||||||
|
|
||||||
|
// 11 Echo
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence *) hash0, 512 );
|
||||||
|
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence *) hash1, 512 );
|
||||||
|
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence *) hash2, 512 );
|
||||||
|
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence *) hash3, 512 );
|
||||||
|
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence *) hash4, 512 );
|
||||||
|
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence *) hash5, 512 );
|
||||||
|
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence *) hash6, 512 );
|
||||||
|
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence *) hash7, 512 );
|
||||||
|
|
||||||
|
memcpy( state, hash0, 32 );
|
||||||
|
memcpy( state+ 32, hash1, 32 );
|
||||||
|
memcpy( state+ 64, hash2, 32 );
|
||||||
|
memcpy( state+ 96, hash3, 32 );
|
||||||
|
memcpy( state+128, hash4, 32 );
|
||||||
|
memcpy( state+160, hash5, 32 );
|
||||||
|
memcpy( state+192, hash6, 32 );
|
||||||
|
memcpy( state+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint32_t n = pdata[19];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
|
||||||
|
max_nonce -= 8;
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
c11_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( ( ( hash+(i<<3) )[7] < Htarg )
|
||||||
|
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined (C11_4WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
bmw512_4way_context bmw;
|
bmw512_4way_context bmw;
|
||||||
|
@@ -2,7 +2,11 @@
|
|||||||
|
|
||||||
bool register_c11_algo( algo_gate_t* gate )
|
bool register_c11_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (C11_4WAY)
|
#if defined (C11_8WAY)
|
||||||
|
init_c11_8way_ctx();
|
||||||
|
gate->scanhash = (void*)&scanhash_c11_8way;
|
||||||
|
gate->hash = (void*)&c11_8way_hash;
|
||||||
|
#elif defined (C11_4WAY)
|
||||||
init_c11_4way_ctx();
|
init_c11_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_c11_4way;
|
gate->scanhash = (void*)&scanhash_c11_4way;
|
||||||
gate->hash = (void*)&c11_4way_hash;
|
gate->hash = (void*)&c11_4way_hash;
|
||||||
@@ -11,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_c11;
|
gate->scanhash = (void*)&scanhash_c11;
|
||||||
gate->hash = (void*)&c11_hash;
|
gate->hash = (void*)&c11_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,29 +4,36 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define C11_4WAY
|
#define C11_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define C11_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
bool register_c11_algo( algo_gate_t* gate );
|
bool register_c11_algo( algo_gate_t* gate );
|
||||||
|
#if defined(C11_8WAY)
|
||||||
|
|
||||||
#if defined(C11_4WAY)
|
void c11_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void init_c11_8way_ctx();
|
||||||
|
|
||||||
|
#elif defined(C11_4WAY)
|
||||||
|
|
||||||
void c11_4way_hash( void *state, const void *input );
|
void c11_4way_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_c11_4way_ctx();
|
void init_c11_4way_ctx();
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
void c11_hash( void *state, const void *input );
|
void c11_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_c11( struct work *work, uint32_t max_nonce,
|
int scanhash_c11( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_c11_ctx();
|
void init_c11_ctx();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
@@ -1,8 +1,5 @@
|
|||||||
#include "cpuminer-config.h"
|
#include "cpuminer-config.h"
|
||||||
#include "x11-gate.h"
|
#include "x11-gate.h"
|
||||||
|
|
||||||
#if defined (X11_4WAY)
|
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "algo/blake/blake-hash-4way.h"
|
#include "algo/blake/blake-hash-4way.h"
|
||||||
@@ -12,11 +9,235 @@
|
|||||||
#include "algo/jh/jh-hash-4way.h"
|
#include "algo/jh/jh-hash-4way.h"
|
||||||
#include "algo/keccak/keccak-hash-4way.h"
|
#include "algo/keccak/keccak-hash-4way.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
|
|
||||||
|
#if defined (X11_8WAY)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
} x11_8way_ctx_holder;
|
||||||
|
|
||||||
|
x11_8way_ctx_holder x11_8way_ctx;
|
||||||
|
|
||||||
|
void init_x11_8way_ctx()
|
||||||
|
{
|
||||||
|
blake512_8way_init( &x11_8way_ctx.blake );
|
||||||
|
bmw512_8way_init( &x11_8way_ctx.bmw );
|
||||||
|
init_groestl( &x11_8way_ctx.groestl, 64 );
|
||||||
|
skein512_8way_init( &x11_8way_ctx.skein );
|
||||||
|
jh512_8way_init( &x11_8way_ctx.jh );
|
||||||
|
keccak512_8way_init( &x11_8way_ctx.keccak );
|
||||||
|
luffa_4way_init( &x11_8way_ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_shavite512_init( &x11_8way_ctx.shavite );
|
||||||
|
simd_4way_init( &x11_8way_ctx.simd, 512 );
|
||||||
|
init_echo( &x11_8way_ctx.echo, 512 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x11_8way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
x11_8way_ctx_holder ctx;
|
||||||
|
memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );
|
||||||
|
blake512_8way_update( &ctx.blake, input, 80 );
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
|
||||||
|
// Serial
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||||
|
|
||||||
|
// 4way
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// Luffa + Cube
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence *) hash0, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence *) hash1, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence *) hash2, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence *) hash3, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence *) hash4, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence *) hash5, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence *) hash6, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence *) hash7, 512 );
|
||||||
|
|
||||||
|
memcpy( state, hash0, 32 );
|
||||||
|
memcpy( state+ 32, hash1, 32 );
|
||||||
|
memcpy( state+ 64, hash2, 32 );
|
||||||
|
memcpy( state+ 96, hash3, 32 );
|
||||||
|
memcpy( state+128, hash4, 32 );
|
||||||
|
memcpy( state+160, hash5, 32 );
|
||||||
|
memcpy( state+192, hash6, 32 );
|
||||||
|
memcpy( state+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint32_t n = pdata[19];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
|
||||||
|
const uint32_t last_nonce = max_nonce -8;
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x11_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( ( hash+(i<<3) )[7] < Htarg
|
||||||
|
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#elif defined (X11_4WAY)
|
||||||
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
bmw512_4way_context bmw;
|
bmw512_4way_context bmw;
|
||||||
|
@@ -1,8 +1,12 @@
|
|||||||
#include "x11-gate.h"
|
#include "x11-gate.h"
|
||||||
|
|
||||||
bool register_x11_algo( algo_gate_t* gate )
|
bool register_x11_algo( algo_gate_t *gate )
|
||||||
{
|
{
|
||||||
#if defined (X11_4WAY)
|
#if defined (X11_8WAY)
|
||||||
|
init_x11_8way_ctx();
|
||||||
|
gate->scanhash = (void*)&scanhash_x11_8way;
|
||||||
|
gate->hash = (void*)&x11_8way_hash;
|
||||||
|
#elif defined (X11_4WAY)
|
||||||
init_x11_4way_ctx();
|
init_x11_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_x11_4way;
|
gate->scanhash = (void*)&scanhash_x11_4way;
|
||||||
gate->hash = (void*)&x11_4way_hash;
|
gate->hash = (void*)&x11_4way_hash;
|
||||||
@@ -11,7 +15,7 @@ bool register_x11_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_x11;
|
gate->scanhash = (void*)&scanhash_x11;
|
||||||
gate->hash = (void*)&x11_hash;
|
gate->hash = (void*)&x11_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,29 +4,35 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define X11_4WAY
|
#define X11_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define X11_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_x11_algo( algo_gate_t* gate );
|
bool register_x11_algo( algo_gate_t* gate );
|
||||||
|
#if defined(X11_8WAY)
|
||||||
|
|
||||||
#if defined(X11_4WAY)
|
void x11_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void init_x11_8way_ctx();
|
||||||
|
|
||||||
|
#elif defined(X11_4WAY)
|
||||||
|
|
||||||
void x11_4way_hash( void *state, const void *input );
|
void x11_4way_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_x11_4way_ctx();
|
void init_x11_4way_ctx();
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
void x11_hash( void *state, const void *input );
|
void x11_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_x11( struct work *work, uint32_t max_nonce,
|
int scanhash_x11( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_x11_ctx();
|
void init_x11_ctx();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
@@ -1,11 +1,7 @@
|
|||||||
#include "cpuminer-config.h"
|
#include "cpuminer-config.h"
|
||||||
#include "x11gost-gate.h"
|
#include "x11gost-gate.h"
|
||||||
|
|
||||||
#if defined (X11GOST_4WAY)
|
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "algo/blake/blake-hash-4way.h"
|
#include "algo/blake/blake-hash-4way.h"
|
||||||
#include "algo/bmw/bmw-hash-4way.h"
|
#include "algo/bmw/bmw-hash-4way.h"
|
||||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||||
@@ -14,18 +10,269 @@
|
|||||||
#include "algo/keccak/keccak-hash-4way.h"
|
#include "algo/keccak/keccak-hash-4way.h"
|
||||||
#include "algo/gost/sph_gost.h"
|
#include "algo/gost/sph_gost.h"
|
||||||
#include "algo/luffa/luffa-hash-2way.h"
|
#include "algo/luffa/luffa-hash-2way.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
#include "algo/shavite/sph_shavite.h"
|
#include "algo/shavite/sph_shavite.h"
|
||||||
#include "algo/simd/simd-hash-2way.h"
|
#include "algo/simd/simd-hash-2way.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
|
|
||||||
|
#if defined (X11GOST_8WAY)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
blake512_8way_context blake;
|
||||||
|
bmw512_8way_context bmw;
|
||||||
|
hashState_groestl groestl;
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
keccak512_8way_context keccak;
|
||||||
|
sph_gost512_context gost;
|
||||||
|
luffa_4way_context luffa;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_shavite512_context shavite;
|
||||||
|
simd_4way_context simd;
|
||||||
|
hashState_echo echo;
|
||||||
|
} x11gost_8way_ctx_holder;
|
||||||
|
|
||||||
|
x11gost_8way_ctx_holder x11gost_8way_ctx;
|
||||||
|
|
||||||
|
void init_x11gost_8way_ctx()
|
||||||
|
{
|
||||||
|
blake512_8way_init( &x11gost_8way_ctx.blake );
|
||||||
|
bmw512_8way_init( &x11gost_8way_ctx.bmw );
|
||||||
|
init_groestl( &x11gost_8way_ctx.groestl, 64 );
|
||||||
|
skein512_8way_init( &x11gost_8way_ctx.skein );
|
||||||
|
jh512_8way_init( &x11gost_8way_ctx.jh );
|
||||||
|
keccak512_8way_init( &x11gost_8way_ctx.keccak );
|
||||||
|
sph_gost512_init( &x11gost_8way_ctx.gost );
|
||||||
|
luffa_4way_init( &x11gost_8way_ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_shavite512_init( &x11gost_8way_ctx.shavite );
|
||||||
|
simd_4way_init( &x11gost_8way_ctx.simd, 512 );
|
||||||
|
init_echo( &x11gost_8way_ctx.echo, 512 );
|
||||||
|
}
|
||||||
|
|
||||||
|
void x11gost_8way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
x11gost_8way_ctx_holder ctx;
|
||||||
|
memcpy( &ctx, &x11gost_8way_ctx, sizeof(x11gost_8way_ctx) );
|
||||||
|
|
||||||
|
blake512_8way_update( &ctx.blake, input, 80 );
|
||||||
|
blake512_8way_close( &ctx.blake, vhash );
|
||||||
|
|
||||||
|
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||||
|
bmw512_8way_close( &ctx.bmw, vhash );
|
||||||
|
|
||||||
|
// Serial
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||||
|
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||||
|
sizeof(hashState_groestl) );
|
||||||
|
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||||
|
|
||||||
|
// 4way
|
||||||
|
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7 );
|
||||||
|
|
||||||
|
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
|
||||||
|
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||||
|
keccak512_8way_close( &ctx.keccak, vhash );
|
||||||
|
|
||||||
|
// Serial
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
sph_gost512( &ctx.gost, hash0, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash0 );
|
||||||
|
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||||
|
sph_gost512( &ctx.gost, hash1, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash1 );
|
||||||
|
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||||
|
sph_gost512( &ctx.gost, hash2, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash2 );
|
||||||
|
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||||
|
sph_gost512( &ctx.gost, hash3, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash3 );
|
||||||
|
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||||
|
sph_gost512( &ctx.gost, hash4, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash4 );
|
||||||
|
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||||
|
sph_gost512( &ctx.gost, hash5, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash5 );
|
||||||
|
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||||
|
sph_gost512( &ctx.gost, hash6, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash6 );
|
||||||
|
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||||
|
sph_gost512( &ctx.gost, hash7, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash7 );
|
||||||
|
|
||||||
|
|
||||||
|
// Luffa + Cube
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
luffa_4way_init( &ctx.luffa, 512 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||||
|
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||||
|
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||||
|
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||||
|
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||||
|
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||||
|
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||||
|
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||||
|
sizeof(sph_shavite512_context) );
|
||||||
|
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||||
|
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||||
|
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
simd_4way_init( &ctx.simd, 512 );
|
||||||
|
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence *) hash0, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence *) hash1, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence *) hash2, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence *) hash3, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence *) hash4, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence *) hash5, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence *) hash6, 512 );
|
||||||
|
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence *) hash7, 512 );
|
||||||
|
|
||||||
|
memcpy( state, hash0, 32 );
|
||||||
|
memcpy( state+ 32, hash1, 32 );
|
||||||
|
memcpy( state+ 64, hash2, 32 );
|
||||||
|
memcpy( state+ 96, hash3, 32 );
|
||||||
|
memcpy( state+128, hash4, 32 );
|
||||||
|
memcpy( state+160, hash5, 32 );
|
||||||
|
memcpy( state+192, hash6, 32 );
|
||||||
|
memcpy( state+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
uint32_t n = pdata[19];
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
|
||||||
|
max_nonce -= 8;
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
x11gost_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( ( hash+(i<<3) )[7] < Htarg
|
||||||
|
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined (X11GOST_4WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
blake512_4way_context blake;
|
blake512_4way_context blake;
|
||||||
bmw512_4way_context bmw;
|
bmw512_4way_context bmw;
|
||||||
hashState_groestl groestl;
|
hashState_groestl groestl;
|
||||||
skein512_4way_context skein;
|
skein512_4way_context skein;
|
||||||
jh512_4way_context jh;
|
jh512_4way_context jh;
|
||||||
keccak512_4way_context keccak;
|
keccak512_4way_context keccak;
|
||||||
sph_gost512_context gost;
|
sph_gost512_context gost;
|
||||||
luffa_2way_context luffa;
|
luffa_2way_context luffa;
|
||||||
cubehashParam cube;
|
cubehashParam cube;
|
||||||
@@ -76,10 +323,10 @@ void x11gost_4way_hash( void *state, const void *input )
|
|||||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||||
sizeof(hashState_groestl) );
|
sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||||
sizeof(hashState_groestl) );
|
sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||||
sizeof(hashState_groestl) );
|
sizeof(hashState_groestl) );
|
||||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||||
|
|
||||||
@@ -175,7 +422,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
|||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
uint32_t n = pdata[19];
|
uint32_t n = pdata[19];
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
int thr_id = mythr->id;
|
||||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||||
const uint32_t Htarg = ptarget[7];
|
const uint32_t Htarg = ptarget[7];
|
||||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||||
@@ -185,7 +432,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
|||||||
|
|
||||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||||
|
|
||||||
for (int m=0; m < 6; m++)
|
for (int m=0; m < 6; m++)
|
||||||
if (Htarg <= htmax[m])
|
if (Htarg <= htmax[m])
|
||||||
{
|
{
|
||||||
uint32_t mask = masks[m];
|
uint32_t mask = masks[m];
|
||||||
|
@@ -2,7 +2,11 @@
|
|||||||
|
|
||||||
bool register_x11gost_algo( algo_gate_t* gate )
|
bool register_x11gost_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined (X11GOST_4WAY)
|
#if defined (X11GOST_8WAY)
|
||||||
|
init_x11gost_8way_ctx();
|
||||||
|
gate->scanhash = (void*)&scanhash_x11gost_8way;
|
||||||
|
gate->hash = (void*)&x11gost_8way_hash;
|
||||||
|
#elif defined (X11GOST_4WAY)
|
||||||
init_x11gost_4way_ctx();
|
init_x11gost_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_x11gost_4way;
|
gate->scanhash = (void*)&scanhash_x11gost_4way;
|
||||||
gate->hash = (void*)&x11gost_4way_hash;
|
gate->hash = (void*)&x11gost_4way_hash;
|
||||||
@@ -11,7 +15,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_x11gost;
|
gate->scanhash = (void*)&scanhash_x11gost;
|
||||||
gate->hash = (void*)&x11gost_hash;
|
gate->hash = (void*)&x11gost_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,29 +4,36 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define X11GOST_4WAY
|
#define X11GOST_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define X11GOST_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_x11gost_algo( algo_gate_t* gate );
|
bool register_x11gost_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
#if defined(X11GOST_4WAY)
|
#if defined(X11GOST_8WAY)
|
||||||
|
|
||||||
|
void x11gost_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void init_x11gost_8way_ctx();
|
||||||
|
|
||||||
|
#elif defined(X11GOST_4WAY)
|
||||||
|
|
||||||
void x11gost_4way_hash( void *state, const void *input );
|
void x11gost_4way_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_x11gost_4way_ctx();
|
void init_x11gost_4way_ctx();
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
void x11gost_hash( void *state, const void *input );
|
void x11gost_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_x11gost( struct work *work, uint32_t max_nonce,
|
int scanhash_x11gost( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_x11gost_ctx();
|
void init_x11gost_ctx();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
@@ -1,7 +1,4 @@
|
|||||||
#include "phi1612-gate.h"
|
#include "phi1612-gate.h"
|
||||||
|
|
||||||
#if defined(PHI1612_4WAY)
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -9,10 +6,193 @@
|
|||||||
#include "algo/skein/skein-hash-4way.h"
|
#include "algo/skein/skein-hash-4way.h"
|
||||||
#include "algo/jh/jh-hash-4way.h"
|
#include "algo/jh/jh-hash-4way.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
#include "algo/fugue/sph_fugue.h"
|
#include "algo/fugue/sph_fugue.h"
|
||||||
#include "algo/gost/sph_gost.h"
|
#include "algo/gost/sph_gost.h"
|
||||||
#include "algo/echo/aes_ni/hash_api.h"
|
#include "algo/echo/aes_ni/hash_api.h"
|
||||||
|
|
||||||
|
#if defined(PHI1612_8WAY)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
skein512_8way_context skein;
|
||||||
|
jh512_8way_context jh;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_fugue512_context fugue;
|
||||||
|
sph_gost512_context gost;
|
||||||
|
hashState_echo echo;
|
||||||
|
} phi1612_8way_ctx_holder;
|
||||||
|
|
||||||
|
phi1612_8way_ctx_holder phi1612_8way_ctx __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
void init_phi1612_8way_ctx()
|
||||||
|
{
|
||||||
|
skein512_8way_init( &phi1612_8way_ctx.skein );
|
||||||
|
jh512_8way_init( &phi1612_8way_ctx.jh );
|
||||||
|
cube_4way_init( &phi1612_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_fugue512_init( &phi1612_8way_ctx.fugue );
|
||||||
|
sph_gost512_init( &phi1612_8way_ctx.gost );
|
||||||
|
init_echo( &phi1612_8way_ctx.echo, 512 );
|
||||||
|
};
|
||||||
|
|
||||||
|
void phi1612_8way_hash( void *state, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
phi1612_8way_ctx_holder ctx;
|
||||||
|
memcpy( &ctx, &phi1612_8way_ctx, sizeof(phi1612_8way_ctx) );
|
||||||
|
|
||||||
|
// Skein parallel 4way
|
||||||
|
skein512_8way_update( &ctx.skein, input, 80 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
|
||||||
|
// JH
|
||||||
|
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||||
|
jh512_8way_close( &ctx.jh, vhash );
|
||||||
|
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||||
|
vhash );
|
||||||
|
|
||||||
|
// Cubehash
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
// Fugue
|
||||||
|
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash4, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash5, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash6, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash7, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||||
|
|
||||||
|
// Gost
|
||||||
|
sph_gost512( &ctx.gost, hash0, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash0 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash1, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash1 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash2, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash2 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash3, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash3 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash4, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash4 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash5, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash5 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash6, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash6 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash7, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, hash7 );
|
||||||
|
|
||||||
|
// Echo
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
(const BitSequence *) hash0, 512 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||||
|
(const BitSequence *) hash1, 512 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||||
|
(const BitSequence *) hash2, 512 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||||
|
(const BitSequence *) hash3, 512 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||||
|
(const BitSequence *) hash4, 512 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||||
|
(const BitSequence *) hash5, 512 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||||
|
(const BitSequence *) hash6, 512 );
|
||||||
|
init_echo( &ctx.echo, 512 );
|
||||||
|
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||||
|
(const BitSequence *) hash7, 512 );
|
||||||
|
|
||||||
|
memcpy( state, hash0, 32 );
|
||||||
|
memcpy( state+ 32, hash1, 32 );
|
||||||
|
memcpy( state+ 64, hash2, 32 );
|
||||||
|
memcpy( state+ 96, hash3, 32 );
|
||||||
|
memcpy( state+128, hash4, 32 );
|
||||||
|
memcpy( state+160, hash5, 32 );
|
||||||
|
memcpy( state+192, hash6, 32 );
|
||||||
|
memcpy( state+224, hash7, 32 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
|
||||||
|
if ( opt_benchmark )
|
||||||
|
( (uint32_t*)ptarget )[7] = 0x0cff;
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
|
||||||
|
do {
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||||
|
|
||||||
|
phi1612_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( (hash+(i<<3))[7] <= Htarg )
|
||||||
|
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n += 8;
|
||||||
|
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(PHI1612_4WAY)
|
||||||
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
skein512_4way_context skein;
|
skein512_4way_context skein;
|
||||||
jh512_4way_context jh;
|
jh512_4way_context jh;
|
||||||
|
@@ -2,7 +2,11 @@
|
|||||||
|
|
||||||
bool register_phi1612_algo( algo_gate_t* gate )
|
bool register_phi1612_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
#if defined(PHI1612_4WAY)
|
#if defined(PHI1612_8WAY)
|
||||||
|
init_phi1612_8way_ctx();
|
||||||
|
gate->scanhash = (void*)&scanhash_phi1612_8way;
|
||||||
|
gate->hash = (void*)&phi1612_8way_hash;
|
||||||
|
#elif defined(PHI1612_4WAY)
|
||||||
init_phi1612_4way_ctx();
|
init_phi1612_4way_ctx();
|
||||||
gate->scanhash = (void*)&scanhash_phi1612_4way;
|
gate->scanhash = (void*)&scanhash_phi1612_4way;
|
||||||
gate->hash = (void*)&phi1612_4way_hash;
|
gate->hash = (void*)&phi1612_4way_hash;
|
||||||
@@ -11,7 +15,7 @@ bool register_phi1612_algo( algo_gate_t* gate )
|
|||||||
gate->scanhash = (void*)&scanhash_phi1612;
|
gate->scanhash = (void*)&scanhash_phi1612;
|
||||||
gate->hash = (void*)&phi1612_hash;
|
gate->hash = (void*)&phi1612_hash;
|
||||||
#endif
|
#endif
|
||||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -4,29 +4,35 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__) && defined(__AES__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define PHI1612_4WAY
|
#define PHI1612_8WAY 1
|
||||||
|
#elif defined(__AVX2__) && defined(__AES__)
|
||||||
|
#define PHI1612_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_phi1612_algo( algo_gate_t* gate );
|
bool register_phi1612_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
#if defined(PHI1612_4WAY)
|
#if defined(PHI1612_8WAY)
|
||||||
|
|
||||||
|
void phi1612_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
void init_phi1612_8way_ctx();
|
||||||
|
|
||||||
|
#elif defined(PHI1612_4WAY)
|
||||||
|
|
||||||
void phi1612_4way_hash( void *state, const void *input );
|
void phi1612_4way_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_phi1612_4way_ctx();
|
void init_phi1612_4way_ctx();
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
|
||||||
void phi1612_hash( void *state, const void *input );
|
void phi1612_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_phi1612( struct work *work, uint32_t max_nonce,
|
int scanhash_phi1612( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
void init_phi1612_ctx();
|
void init_phi1612_ctx();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
@@ -1,7 +1,4 @@
|
|||||||
#include "skunk-gate.h"
|
#include "skunk-gate.h"
|
||||||
|
|
||||||
#if defined(SKUNK_4WAY)
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@@ -10,6 +7,146 @@
|
|||||||
#include "algo/gost/sph_gost.h"
|
#include "algo/gost/sph_gost.h"
|
||||||
#include "algo/fugue/sph_fugue.h"
|
#include "algo/fugue/sph_fugue.h"
|
||||||
#include "algo/cubehash/cubehash_sse2.h"
|
#include "algo/cubehash/cubehash_sse2.h"
|
||||||
|
#include "algo/cubehash/cube-hash-2way.h"
|
||||||
|
|
||||||
|
#if defined(SKUNK_8WAY)
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
skein512_8way_context skein;
|
||||||
|
cube_4way_context cube;
|
||||||
|
sph_fugue512_context fugue;
|
||||||
|
sph_gost512_context gost;
|
||||||
|
} skunk_8way_ctx_holder;
|
||||||
|
|
||||||
|
static __thread skunk_8way_ctx_holder skunk_8way_ctx;
|
||||||
|
|
||||||
|
void skunk_8way_hash( void *output, const void *input )
|
||||||
|
{
|
||||||
|
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||||
|
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||||
|
|
||||||
|
skunk_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||||
|
memcpy( &ctx, &skunk_8way_ctx, sizeof(skunk_8way_ctx) );
|
||||||
|
|
||||||
|
skein512_8way_update( &ctx.skein, input, 80 );
|
||||||
|
skein512_8way_close( &ctx.skein, vhash );
|
||||||
|
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||||
|
hash7, vhash, 512 );
|
||||||
|
|
||||||
|
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||||
|
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||||
|
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||||
|
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||||
|
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||||
|
|
||||||
|
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||||
|
sph_fugue512( &ctx.fugue, hash4, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash5, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash6, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||||
|
sph_fugue512_init( &ctx.fugue );
|
||||||
|
sph_fugue512( &ctx.fugue, hash7, 64 );
|
||||||
|
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||||
|
|
||||||
|
sph_gost512( &ctx.gost, hash0, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, output );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash1, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, output+ 32 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash2, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, output+ 64 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash3, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, output+ 96 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash4, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, output+128 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash5, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, output+160 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash6, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, output+192 );
|
||||||
|
sph_gost512_init( &ctx.gost );
|
||||||
|
sph_gost512( &ctx.gost, hash7, 64 );
|
||||||
|
sph_gost512_close( &ctx.gost, output+224 );
|
||||||
|
}
|
||||||
|
|
||||||
|
int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr )
|
||||||
|
{
|
||||||
|
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||||
|
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
uint32_t n = first_nonce;
|
||||||
|
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
int thr_id = mythr->id;
|
||||||
|
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||||
|
|
||||||
|
if ( opt_benchmark )
|
||||||
|
((uint32_t*)ptarget)[7] = 0x0cff;
|
||||||
|
|
||||||
|
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||||
|
do
|
||||||
|
{
|
||||||
|
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||||
|
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||||
|
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||||
|
|
||||||
|
skunk_8way_hash( hash, vdata );
|
||||||
|
pdata[19] = n;
|
||||||
|
|
||||||
|
for ( int i = 0; i < 8; i++ )
|
||||||
|
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
|
||||||
|
if ( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
|
||||||
|
{
|
||||||
|
pdata[19] = n+i;
|
||||||
|
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||||
|
}
|
||||||
|
n +=8;
|
||||||
|
} while ( likely( ( n < max_nonce-8 ) && !(*restart) ) );
|
||||||
|
|
||||||
|
*hashes_done = n - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool skunk_8way_thread_init()
|
||||||
|
{
|
||||||
|
skein512_8way_init( &skunk_8way_ctx.skein );
|
||||||
|
cube_4way_init( &skunk_8way_ctx.cube, 512, 16, 32 );
|
||||||
|
sph_fugue512_init( &skunk_8way_ctx.fugue );
|
||||||
|
sph_gost512_init( &skunk_8way_ctx.gost );
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(SKUNK_4WAY)
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
skein512_4way_context skein;
|
skein512_4way_context skein;
|
||||||
|
@@ -2,12 +2,15 @@
|
|||||||
|
|
||||||
bool register_skunk_algo( algo_gate_t* gate )
|
bool register_skunk_algo( algo_gate_t* gate )
|
||||||
{
|
{
|
||||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||||
#if defined (SKUNK_4WAY)
|
#if defined (SKUNK_8WAY)
|
||||||
|
gate->miner_thread_init = (void*)&skunk_8way_thread_init;
|
||||||
|
gate->scanhash = (void*)&scanhash_skunk_8way;
|
||||||
|
gate->hash = (void*)&skunk_8way_hash;
|
||||||
|
#elif defined (SKUNK_4WAY)
|
||||||
gate->miner_thread_init = (void*)&skunk_4way_thread_init;
|
gate->miner_thread_init = (void*)&skunk_4way_thread_init;
|
||||||
gate->scanhash = (void*)&scanhash_skunk_4way;
|
gate->scanhash = (void*)&scanhash_skunk_4way;
|
||||||
gate->hash = (void*)&skunk_4way_hash;
|
gate->hash = (void*)&skunk_4way_hash;
|
||||||
// init_skunk_4way_ctx();
|
|
||||||
#else
|
#else
|
||||||
gate->miner_thread_init = (void*)&skunk_thread_init;
|
gate->miner_thread_init = (void*)&skunk_thread_init;
|
||||||
gate->scanhash = (void*)&scanhash_skunk;
|
gate->scanhash = (void*)&scanhash_skunk;
|
||||||
|
@@ -4,29 +4,33 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
#define SKUNK_4WAY
|
#define SKUNK_8WAY 1
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
#define SKUNK_4WAY 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool register_skunk_algo( algo_gate_t* gate );
|
bool register_skunk_algo( algo_gate_t* gate );
|
||||||
|
|
||||||
#if defined(SKUNK_4WAY)
|
#if defined(SKUNK_8WAY)
|
||||||
|
|
||||||
|
void skunk_8way_hash( void *state, const void *input );
|
||||||
|
int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
|
||||||
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
bool skunk_8way_thread_init();
|
||||||
|
|
||||||
|
#elif defined(SKUNK_4WAY)
|
||||||
|
|
||||||
void skunk_4way_hash( void *state, const void *input );
|
void skunk_4way_hash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
|
int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
bool skunk_4way_thread_init();
|
bool skunk_4way_thread_init();
|
||||||
//void init_skunk_4way_ctx();
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void skunkhash( void *state, const void *input );
|
void skunkhash( void *state, const void *input );
|
||||||
|
|
||||||
int scanhash_skunk( struct work *work, uint32_t max_nonce,
|
int scanhash_skunk( struct work *work, uint32_t max_nonce,
|
||||||
uint64_t *hashes_done, struct thr_info *mythr );
|
uint64_t *hashes_done, struct thr_info *mythr );
|
||||||
|
|
||||||
bool skunk_thread_init();
|
bool skunk_thread_init();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -127,6 +127,7 @@ void x17_4way_hash( void *state, const void *input )
|
|||||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||||
|
|
||||||
|
|
||||||
// 11 Echo serial
|
// 11 Echo serial
|
||||||
init_echo( &ctx.echo, 512 );
|
init_echo( &ctx.echo, 512 );
|
||||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||||
|
@@ -29,7 +29,7 @@ rm -f config.status
|
|||||||
CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl
|
CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl
|
||||||
make -j 16
|
make -j 16
|
||||||
strip -s cpuminer.exe
|
strip -s cpuminer.exe
|
||||||
mv cpuminer.exe cpuminer-aes-avx.exe
|
mv cpuminer.exe cpuminer-avx.exe
|
||||||
strip -s cpuminer
|
strip -s cpuminer
|
||||||
mv cpuminer cpuminer-aes-avx
|
mv cpuminer cpuminer-aes-avx
|
||||||
|
|
||||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
|||||||
#! /bin/sh
|
#! /bin/sh
|
||||||
# Guess values for system-dependent variables and create Makefiles.
|
# Guess values for system-dependent variables and create Makefiles.
|
||||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.1.
|
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
|||||||
# Identity of this package.
|
# Identity of this package.
|
||||||
PACKAGE_NAME='cpuminer-opt'
|
PACKAGE_NAME='cpuminer-opt'
|
||||||
PACKAGE_TARNAME='cpuminer-opt'
|
PACKAGE_TARNAME='cpuminer-opt'
|
||||||
PACKAGE_VERSION='3.10.1'
|
PACKAGE_VERSION='3.10.2'
|
||||||
PACKAGE_STRING='cpuminer-opt 3.10.1'
|
PACKAGE_STRING='cpuminer-opt 3.10.2'
|
||||||
PACKAGE_BUGREPORT=''
|
PACKAGE_BUGREPORT=''
|
||||||
PACKAGE_URL=''
|
PACKAGE_URL=''
|
||||||
|
|
||||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
|||||||
# Omit some internal or obsolete options to make the list less imposing.
|
# Omit some internal or obsolete options to make the list less imposing.
|
||||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||||
cat <<_ACEOF
|
cat <<_ACEOF
|
||||||
\`configure' configures cpuminer-opt 3.10.1 to adapt to many kinds of systems.
|
\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems.
|
||||||
|
|
||||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||||
|
|
||||||
@@ -1404,7 +1404,7 @@ fi
|
|||||||
|
|
||||||
if test -n "$ac_init_help"; then
|
if test -n "$ac_init_help"; then
|
||||||
case $ac_init_help in
|
case $ac_init_help in
|
||||||
short | recursive ) echo "Configuration of cpuminer-opt 3.10.1:";;
|
short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";;
|
||||||
esac
|
esac
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
|
|
||||||
@@ -1509,7 +1509,7 @@ fi
|
|||||||
test -n "$ac_init_help" && exit $ac_status
|
test -n "$ac_init_help" && exit $ac_status
|
||||||
if $ac_init_version; then
|
if $ac_init_version; then
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
cpuminer-opt configure 3.10.1
|
cpuminer-opt configure 3.10.2
|
||||||
generated by GNU Autoconf 2.69
|
generated by GNU Autoconf 2.69
|
||||||
|
|
||||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
|||||||
This file contains any messages produced by compilers while
|
This file contains any messages produced by compilers while
|
||||||
running configure, to aid debugging if configure makes a mistake.
|
running configure, to aid debugging if configure makes a mistake.
|
||||||
|
|
||||||
It was created by cpuminer-opt $as_me 3.10.1, which was
|
It was created by cpuminer-opt $as_me 3.10.2, which was
|
||||||
generated by GNU Autoconf 2.69. Invocation command line was
|
generated by GNU Autoconf 2.69. Invocation command line was
|
||||||
|
|
||||||
$ $0 $@
|
$ $0 $@
|
||||||
@@ -2993,7 +2993,7 @@ fi
|
|||||||
|
|
||||||
# Define the identity of the package.
|
# Define the identity of the package.
|
||||||
PACKAGE='cpuminer-opt'
|
PACKAGE='cpuminer-opt'
|
||||||
VERSION='3.10.1'
|
VERSION='3.10.2'
|
||||||
|
|
||||||
|
|
||||||
cat >>confdefs.h <<_ACEOF
|
cat >>confdefs.h <<_ACEOF
|
||||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
|||||||
# report actual input values of CONFIG_FILES etc. instead of their
|
# report actual input values of CONFIG_FILES etc. instead of their
|
||||||
# values after options handling.
|
# values after options handling.
|
||||||
ac_log="
|
ac_log="
|
||||||
This file was extended by cpuminer-opt $as_me 3.10.1, which was
|
This file was extended by cpuminer-opt $as_me 3.10.2, which was
|
||||||
generated by GNU Autoconf 2.69. Invocation command line was
|
generated by GNU Autoconf 2.69. Invocation command line was
|
||||||
|
|
||||||
CONFIG_FILES = $CONFIG_FILES
|
CONFIG_FILES = $CONFIG_FILES
|
||||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
|||||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||||
ac_cs_version="\\
|
ac_cs_version="\\
|
||||||
cpuminer-opt config.status 3.10.1
|
cpuminer-opt config.status 3.10.2
|
||||||
configured by $0, generated by GNU Autoconf 2.69,
|
configured by $0, generated by GNU Autoconf 2.69,
|
||||||
with options \\"\$ac_cs_config\\"
|
with options \\"\$ac_cs_config\\"
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
AC_INIT([cpuminer-opt], [3.10.1])
|
AC_INIT([cpuminer-opt], [3.10.2])
|
||||||
|
|
||||||
AC_PREREQ([2.59c])
|
AC_PREREQ([2.59c])
|
||||||
AC_CANONICAL_SYSTEM
|
AC_CANONICAL_SYSTEM
|
||||||
|
@@ -3327,7 +3327,7 @@ static void show_credits()
|
|||||||
{
|
{
|
||||||
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
|
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
|
||||||
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
|
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
|
||||||
printf(" with AES_NI and AVX2, AVX512 and SHA extensions.\n");
|
printf(" with AES_NI, AVX2, AVX512 and SHA extensions.\n");
|
||||||
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
|
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3343,7 +3343,7 @@ bool check_cpu_capability ()
|
|||||||
bool cpu_has_avx512 = has_avx512();
|
bool cpu_has_avx512 = has_avx512();
|
||||||
bool cpu_has_vaes = has_vaes();
|
bool cpu_has_vaes = has_vaes();
|
||||||
bool sw_has_aes = false;
|
bool sw_has_aes = false;
|
||||||
bool sw_has_sse2 = false;
|
bool sw_has_sse2 = false;
|
||||||
bool sw_has_sse42 = false;
|
bool sw_has_sse42 = false;
|
||||||
bool sw_has_avx = false;
|
bool sw_has_avx = false;
|
||||||
bool sw_has_avx2 = false;
|
bool sw_has_avx2 = false;
|
||||||
@@ -3412,8 +3412,8 @@ bool check_cpu_capability ()
|
|||||||
|
|
||||||
printf("CPU features:");
|
printf("CPU features:");
|
||||||
if ( cpu_has_vaes ) printf( " VAES" );
|
if ( cpu_has_vaes ) printf( " VAES" );
|
||||||
if ( cpu_has_sha ) printf( " SHA" );
|
|
||||||
else if ( cpu_has_aes ) printf( " AES" );
|
else if ( cpu_has_aes ) printf( " AES" );
|
||||||
|
if ( cpu_has_sha ) printf( " SHA" );
|
||||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||||
else if ( cpu_has_avx2 ) printf( " AVX2" );
|
else if ( cpu_has_avx2 ) printf( " AVX2" );
|
||||||
else if ( cpu_has_avx ) printf( " AVX" );
|
else if ( cpu_has_avx ) printf( " AVX" );
|
||||||
|
@@ -1528,6 +1528,58 @@ static inline void intrlv_8x64( void *dst, const void *src0,
|
|||||||
d[63] = _mm_unpackhi_epi64( s6[7], s7[7] );
|
d[63] = _mm_unpackhi_epi64( s6[7], s7[7] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void intrlv_8x64_512( void *dst, const void *src0,
|
||||||
|
const void *src1, const void *src2, const void *src3,
|
||||||
|
const void *src4, const void *src5, const void *src6,
|
||||||
|
const void *src7 )
|
||||||
|
{
|
||||||
|
__m128i *d = (__m128i*)dst;
|
||||||
|
const __m128i *s0 = (const __m128i*)src0;
|
||||||
|
const __m128i *s1 = (const __m128i*)src1;
|
||||||
|
const __m128i *s2 = (const __m128i*)src2;
|
||||||
|
const __m128i *s3 = (const __m128i*)src3;
|
||||||
|
const __m128i *s4 = (const __m128i*)src4;
|
||||||
|
const __m128i *s5 = (const __m128i*)src5;
|
||||||
|
const __m128i *s6 = (const __m128i*)src6;
|
||||||
|
const __m128i *s7 = (const __m128i*)src7;
|
||||||
|
|
||||||
|
d[ 0] = _mm_unpacklo_epi64( s0[0], s1[0] );
|
||||||
|
d[ 1] = _mm_unpacklo_epi64( s2[0], s3[0] );
|
||||||
|
d[ 2] = _mm_unpacklo_epi64( s4[0], s5[0] );
|
||||||
|
d[ 3] = _mm_unpacklo_epi64( s6[0], s7[0] );
|
||||||
|
d[ 4] = _mm_unpackhi_epi64( s0[0], s1[0] );
|
||||||
|
d[ 5] = _mm_unpackhi_epi64( s2[0], s3[0] );
|
||||||
|
d[ 6] = _mm_unpackhi_epi64( s4[0], s5[0] );
|
||||||
|
d[ 7] = _mm_unpackhi_epi64( s6[0], s7[0] );
|
||||||
|
|
||||||
|
d[ 8] = _mm_unpacklo_epi64( s0[1], s1[1] );
|
||||||
|
d[ 9] = _mm_unpacklo_epi64( s2[1], s3[1] );
|
||||||
|
d[10] = _mm_unpacklo_epi64( s4[1], s5[1] );
|
||||||
|
d[11] = _mm_unpacklo_epi64( s6[1], s7[1] );
|
||||||
|
d[12] = _mm_unpackhi_epi64( s0[1], s1[1] );
|
||||||
|
d[13] = _mm_unpackhi_epi64( s2[1], s3[1] );
|
||||||
|
d[14] = _mm_unpackhi_epi64( s4[1], s5[1] );
|
||||||
|
d[15] = _mm_unpackhi_epi64( s6[1], s7[1] );
|
||||||
|
|
||||||
|
d[16] = _mm_unpacklo_epi64( s0[2], s1[2] );
|
||||||
|
d[17] = _mm_unpacklo_epi64( s2[2], s3[2] );
|
||||||
|
d[18] = _mm_unpacklo_epi64( s4[2], s5[2] );
|
||||||
|
d[19] = _mm_unpacklo_epi64( s6[2], s7[2] );
|
||||||
|
d[20] = _mm_unpackhi_epi64( s0[2], s1[2] );
|
||||||
|
d[21] = _mm_unpackhi_epi64( s2[2], s3[2] );
|
||||||
|
d[22] = _mm_unpackhi_epi64( s4[2], s5[2] );
|
||||||
|
d[23] = _mm_unpackhi_epi64( s6[2], s7[2] );
|
||||||
|
|
||||||
|
d[24] = _mm_unpacklo_epi64( s0[3], s1[3] );
|
||||||
|
d[25] = _mm_unpacklo_epi64( s2[3], s3[3] );
|
||||||
|
d[26] = _mm_unpacklo_epi64( s4[3], s5[3] );
|
||||||
|
d[27] = _mm_unpacklo_epi64( s6[3], s7[3] );
|
||||||
|
d[28] = _mm_unpackhi_epi64( s0[3], s1[3] );
|
||||||
|
d[29] = _mm_unpackhi_epi64( s2[3], s3[3] );
|
||||||
|
d[30] = _mm_unpackhi_epi64( s4[3], s5[3] );
|
||||||
|
d[31] = _mm_unpackhi_epi64( s6[3], s7[3] );
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#define ILEAVE_8x64( i ) do \
|
#define ILEAVE_8x64( i ) do \
|
||||||
{ \
|
{ \
|
||||||
@@ -1656,6 +1708,57 @@ static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2,
|
|||||||
d7[7] = _mm_unpackhi_epi64( s[59], s[63] );
|
d7[7] = _mm_unpackhi_epi64( s[59], s[63] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2,
|
||||||
|
void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
|
||||||
|
const void *src )
|
||||||
|
{
|
||||||
|
__m128i *d0 = (__m128i*)dst0;
|
||||||
|
__m128i *d1 = (__m128i*)dst1;
|
||||||
|
__m128i *d2 = (__m128i*)dst2;
|
||||||
|
__m128i *d3 = (__m128i*)dst3;
|
||||||
|
__m128i *d4 = (__m128i*)dst4;
|
||||||
|
__m128i *d5 = (__m128i*)dst5;
|
||||||
|
__m128i *d6 = (__m128i*)dst6;
|
||||||
|
__m128i *d7 = (__m128i*)dst7;
|
||||||
|
const __m128i* s = (const __m128i*)src;
|
||||||
|
|
||||||
|
d0[0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
|
||||||
|
d1[0] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
|
||||||
|
d2[0] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
|
||||||
|
d3[0] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
|
||||||
|
d4[0] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
|
||||||
|
d5[0] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
|
||||||
|
d6[0] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
|
||||||
|
d7[0] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
|
||||||
|
|
||||||
|
d0[1] = _mm_unpacklo_epi64( s[ 8], s[12] );
|
||||||
|
d1[1] = _mm_unpackhi_epi64( s[ 8], s[12] );
|
||||||
|
d2[1] = _mm_unpacklo_epi64( s[ 9], s[13] );
|
||||||
|
d3[1] = _mm_unpackhi_epi64( s[ 9], s[13] );
|
||||||
|
d4[1] = _mm_unpacklo_epi64( s[10], s[14] );
|
||||||
|
d5[1] = _mm_unpackhi_epi64( s[10], s[14] );
|
||||||
|
d6[1] = _mm_unpacklo_epi64( s[11], s[15] );
|
||||||
|
d7[1] = _mm_unpackhi_epi64( s[11], s[15] );
|
||||||
|
|
||||||
|
d0[2] = _mm_unpacklo_epi64( s[16], s[20] );
|
||||||
|
d1[2] = _mm_unpackhi_epi64( s[16], s[20] );
|
||||||
|
d2[2] = _mm_unpacklo_epi64( s[17], s[21] );
|
||||||
|
d3[2] = _mm_unpackhi_epi64( s[17], s[21] );
|
||||||
|
d4[2] = _mm_unpacklo_epi64( s[18], s[22] );
|
||||||
|
d5[2] = _mm_unpackhi_epi64( s[18], s[22] );
|
||||||
|
d6[2] = _mm_unpacklo_epi64( s[19], s[23] );
|
||||||
|
d7[2] = _mm_unpackhi_epi64( s[19], s[23] );
|
||||||
|
|
||||||
|
d0[3] = _mm_unpacklo_epi64( s[24], s[28] );
|
||||||
|
d1[3] = _mm_unpackhi_epi64( s[24], s[28] );
|
||||||
|
d2[3] = _mm_unpacklo_epi64( s[25], s[29] );
|
||||||
|
d3[3] = _mm_unpackhi_epi64( s[25], s[29] );
|
||||||
|
d4[3] = _mm_unpacklo_epi64( s[26], s[30] );
|
||||||
|
d5[3] = _mm_unpackhi_epi64( s[26], s[30] );
|
||||||
|
d6[3] = _mm_unpacklo_epi64( s[27], s[31] );
|
||||||
|
d7[3] = _mm_unpackhi_epi64( s[27], s[31] );
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#define DLEAVE_8x64( i ) do \
|
#define DLEAVE_8x64( i ) do \
|
||||||
{ \
|
{ \
|
||||||
@@ -1910,6 +2013,32 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
|
||||||
|
{
|
||||||
|
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||||
|
__m128i s0 = casti_m128i( src,0 );
|
||||||
|
__m128i s1 = casti_m128i( src,1 );
|
||||||
|
__m128i s2 = casti_m128i( src,2 );
|
||||||
|
__m128i s3 = casti_m128i( src,3 );
|
||||||
|
__m128i s4 = casti_m128i( src,4 );
|
||||||
|
|
||||||
|
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
|
||||||
|
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
|
||||||
|
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
|
||||||
|
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
|
||||||
|
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
|
||||||
|
|
||||||
|
casti_m512i( d, 0 ) = _mm512_broadcast_i64x2( s0 );
|
||||||
|
casti_m512i( d, 1 ) = _mm512_broadcast_i64x2( s1 );
|
||||||
|
casti_m512i( d, 2 ) = _mm512_broadcast_i64x2( s2 );
|
||||||
|
casti_m512i( d, 3 ) = _mm512_broadcast_i64x2( s3 );
|
||||||
|
casti_m512i( d, 4 ) = _mm512_broadcast_i64x2( s4 );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
// 2x256 (AVX512)
|
// 2x256 (AVX512)
|
||||||
|
|
||||||
#if defined (__AVX__)
|
#if defined (__AVX__)
|
||||||
@@ -1946,6 +2075,9 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
|
|||||||
d0[3] = s[6]; d1[3] = s[7];
|
d0[3] = s[6]; d1[3] = s[7];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif // AVX
|
#endif // AVX
|
||||||
|
|
||||||
///////////////////////////
|
///////////////////////////
|
||||||
|
@@ -243,7 +243,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
|||||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
#define mm128_ror_64 _mm_ror_epi64
|
#define mm128_ror_64 _mm_ror_epi64
|
||||||
|
@@ -454,6 +454,13 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
|||||||
// Swap 32 bit elements in each 64 bit lane
|
// Swap 32 bit elements in each 64 bit lane
|
||||||
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
#define mm256_rol1x16_64( v ) _mm256_rol_epi64( v, 16 )
|
||||||
|
#define mm256_ror1x16_64( v ) _mm256_ror_epi64( v, 16 )
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#define mm256_ror1x16_64( v ) \
|
#define mm256_ror1x16_64( v ) \
|
||||||
_mm256_shuffle_epi8( v, \
|
_mm256_shuffle_epi8( v, \
|
||||||
m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
|
m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
|
||||||
@@ -463,6 +470,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
|||||||
_mm256_shuffle_epi8( v, \
|
_mm256_shuffle_epi8( v, \
|
||||||
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
|
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
|
||||||
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
|
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
|
||||||
|
#endif
|
||||||
|
|
||||||
#define mm256_ror1x8_64( v ) \
|
#define mm256_ror1x8_64( v ) \
|
||||||
_mm256_shuffle_epi8( v, \
|
_mm256_shuffle_epi8( v, \
|
||||||
@@ -486,10 +494,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
|||||||
|
|
||||||
|
|
||||||
// Swap 16 bit elements in each 32 bit lane
|
// Swap 16 bit elements in each 32 bit lane
|
||||||
|
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
#define mm256_swap16_32( v ) _mm256_rol_epi32( v, 16 )
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#define mm256_swap16_32( v ) \
|
#define mm256_swap16_32( v ) \
|
||||||
_mm256_shuffle_epi8( v, \
|
_mm256_shuffle_epi8( v, \
|
||||||
m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
|
m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
|
||||||
0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
|
0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
// Swap bytes in vector elements, endian bswap.
|
// Swap bytes in vector elements, endian bswap.
|
||||||
|
@@ -13,20 +13,31 @@
|
|||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
// AVX512 intrinsics have a few peculiarities with permutes and shuffles
|
// AVX512 intrinsics have a few changes from previous conventions.
|
||||||
// that are inconsistent with previous AVX2 implementations.
|
//
|
||||||
|
// Some instructions like cmp and blend use the mask registers now instead of
|
||||||
|
// a vector mask.
|
||||||
|
//
|
||||||
|
// The new rotate instructions require the count to be only an 8 bit
|
||||||
|
// immediate value. The documentation is the same as for shift and
|
||||||
|
// it allows variables. Suspect a compiler issue but it still happens
|
||||||
|
// in GCC9.
|
||||||
//
|
//
|
||||||
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
||||||
// usually shuffles across all lanes.
|
// usually shuffles across all lanes.
|
||||||
//
|
//
|
||||||
|
// Some instructions like cmp and blend use a mask register now instead of
|
||||||
|
// a mask vector.
|
||||||
|
//
|
||||||
// permutexvar has args reversed, index is first arg. Previously all
|
// permutexvar has args reversed, index is first arg. Previously all
|
||||||
// permutes and shuffles have the source vector first.
|
// permutes and shuffles have the index last.
|
||||||
//
|
//
|
||||||
// _mm512_permutexvar_epi8 requires AVX512-VBMI, larger elements don't.
|
// _mm512_permutexvar_epi8 requires AVX512-VBMI, larger elements don't.
|
||||||
// It also performs the same op as _mm512_shuffle_epi8.
|
// It also performs the same op as _mm512_shuffle_epi8.
|
||||||
//
|
//
|
||||||
// _mm512_shuffle_epi8 shuffles accross entire 512 bits. Shuffle usually
|
// shuffle_epi8 shuffles across entire 512 bits. Shuffle usually
|
||||||
// doesn't cross 128 bit lane boundaries.
|
// doesn't cross 128 bit lane boundaries but is consistent with AVX2
|
||||||
|
// where shuffle_epi8 spans the entire vector.
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////
|
||||||
//
|
//
|
||||||
@@ -35,7 +46,6 @@
|
|||||||
// Other AVX512 extensions that may be required for some functions.
|
// Other AVX512 extensions that may be required for some functions.
|
||||||
// __AVX512VBMI__ __AVX512VAES__
|
// __AVX512VBMI__ __AVX512VAES__
|
||||||
//
|
//
|
||||||
// Experimental, not fully tested.
|
|
||||||
|
|
||||||
// Move integer to/from element 0 of vector.
|
// Move integer to/from element 0 of vector.
|
||||||
|
|
||||||
@@ -88,10 +98,19 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
|
|||||||
return mm512_concat_256( hi, lo );
|
return mm512_concat_256( hi, lo );
|
||||||
}
|
}
|
||||||
|
|
||||||
// Equivalent of set4, broadcast 256 bits in groups of four 64 bit constants
|
// Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
|
||||||
// to all 256 bit lanes: {i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0}.
|
#define m512_const1_256( i ) _mm512_broadcast_i64x4( i )
|
||||||
|
#define m512_const1_128( i ) _mm512_broadcast_i64x2( i )
|
||||||
|
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||||
|
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||||
|
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
|
||||||
|
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||||
|
|
||||||
|
#define m512_const2_64( i1, i0 ) \
|
||||||
|
m512_const1_128( m128_const_64( i1, i0 ) )
|
||||||
|
|
||||||
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||||
const uint64_t i1, const uint64_t i0 )
|
const uint64_t i1, const uint64_t i0 )
|
||||||
{
|
{
|
||||||
__m256i lo = mm256_mov64_256( i0 );
|
__m256i lo = mm256_mov64_256( i0 );
|
||||||
__m128i hi = mm128_mov64_128( i2 );
|
__m128i hi = mm128_mov64_128( i2 );
|
||||||
@@ -99,25 +118,9 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
|||||||
_mm_insert_epi64( _mm256_castsi256_si128(
|
_mm_insert_epi64( _mm256_castsi256_si128(
|
||||||
lo ), i1, 1 ) );
|
lo ), i1, 1 ) );
|
||||||
hi = _mm_insert_epi64( hi, i3, 1 );
|
hi = _mm_insert_epi64( hi, i3, 1 );
|
||||||
return _mm512_permutex_epi64( _mm512_castsi256_si512(
|
return _mm512_broadcast_i64x4( _mm256_inserti128_si256( lo, hi, 1 ) );
|
||||||
_mm256_inserti128_si256( lo, hi, 1 ) ), 0xe4 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Broadcast 128 bits in pairs of 64 bit constants {i1. i0} to all
|
|
||||||
// 128 bit lanes.
|
|
||||||
#define m512_const2_64( i1, i0 ) \
|
|
||||||
_mm512_permutex_epi64( _mm512_castsi128_si512( \
|
|
||||||
m128_const_64( i1, i0 ) ), 0x44 )
|
|
||||||
|
|
||||||
// Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
|
|
||||||
#define m512_const1_256( i ) _mm512_broadcast_i64x4( i )
|
|
||||||
#define m512_const1_128( i ) _mm512_broadcast_i64x2( i )
|
|
||||||
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
|
|
||||||
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
|
|
||||||
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
|
|
||||||
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Pseudo constants.
|
// Pseudo constants.
|
||||||
|
|
||||||
@@ -136,17 +139,6 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
|||||||
|
|
||||||
#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
|
#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
|
||||||
|
|
||||||
/*
|
|
||||||
// EVEX vcmpeqq returns a bit mask instead of a vector
|
|
||||||
static inline __m512i mm512_neg1_fn()
|
|
||||||
{
|
|
||||||
__m512i a;
|
|
||||||
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
#define m512_neg1 mm512_neg1_fn()
|
|
||||||
*/
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Basic operations without SIMD equivalent
|
// Basic operations without SIMD equivalent
|
||||||
|
|
||||||
@@ -209,7 +201,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
|
|
||||||
|
|
||||||
// Horizontal vector testing
|
// Horizontal vector testing
|
||||||
// Returns bit mask
|
// Returns a bit mask of type __mmask8
|
||||||
#define mm512_allbits0( a ) _mm512_cmpeq_epi64_mask( a, m512_zero )
|
#define mm512_allbits0( a ) _mm512_cmpeq_epi64_mask( a, m512_zero )
|
||||||
#define mm512_allbits1( a ) _mm512_cmpeq_epi64_mask( a, m512_neg1 )
|
#define mm512_allbits1( a ) _mm512_cmpeq_epi64_mask( a, m512_neg1 )
|
||||||
#define mm512_anybits0( a ) _mm512_cmpneq_epi64_mask( a, m512_neg1 )
|
#define mm512_anybits0( a ) _mm512_cmpneq_epi64_mask( a, m512_neg1 )
|
||||||
@@ -514,6 +506,12 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 )
|
#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 )
|
||||||
|
|
||||||
// Rotate each 64 bit lane by one 16 bit element.
|
// Rotate each 64 bit lane by one 16 bit element.
|
||||||
|
#define mm512_ror1x16_64( v ) _mm512_ror_epi64( v, 16 )
|
||||||
|
#define mm512_rol1x16_64( v ) _mm512_rol_epi64( v, 16 )
|
||||||
|
#define mm512_ror1x8_64( v ) _mm512_ror_epi64( v, 8 )
|
||||||
|
#define mm512_rol1x8_64( v ) _mm512_rol_epi64( v, 8 )
|
||||||
|
|
||||||
|
/*
|
||||||
#define mm512_ror1x16_64( v ) \
|
#define mm512_ror1x16_64( v ) \
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
_mm512_permutexvar_epi16( m512_const_64( \
|
||||||
0x001c001f001e001d, 0x0018001b001a0019, \
|
0x001c001f001e001d, 0x0018001b001a0019, \
|
||||||
@@ -541,10 +539,16 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
0x2E2D2C2B2A29282F, 0x2625242322212027, \
|
0x2E2D2C2B2A29282F, 0x2625242322212027, \
|
||||||
0x1E1D1C1B1A19181F, 0x1615141312111017, \
|
0x1E1D1C1B1A19181F, 0x1615141312111017, \
|
||||||
0x0E0D0C0B0A09080F, 0x0605040302010007 ) )
|
0x0E0D0C0B0A09080F, 0x0605040302010007 ) )
|
||||||
|
*/
|
||||||
|
|
||||||
//
|
//
|
||||||
// Rotate elements within 32 bit lanes.
|
// Rotate elements within 32 bit lanes.
|
||||||
|
|
||||||
|
#define mm512_swap16_32( v ) _mm512_ror_epi32( v, 16 )
|
||||||
|
#define mm512_ror1x8_32( v ) _mm512_ror_epi32( v, 8 )
|
||||||
|
#define mm512_rol1x8_32( v ) _mm512_rol_epi32( v, 8 )
|
||||||
|
|
||||||
|
/*
|
||||||
#define mm512_swap16_32( v ) \
|
#define mm512_swap16_32( v ) \
|
||||||
_mm512_permutexvar_epi16( m512_const_64( \
|
_mm512_permutexvar_epi16( m512_const_64( \
|
||||||
0x001e001f001c001d, 0x001a001b00180019, \
|
0x001e001f001c001d, 0x001a001b00180019, \
|
||||||
@@ -565,6 +569,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
|||||||
0x2E2D2C2F2A29282B, 0x2625242722212023, \
|
0x2E2D2C2F2A29282B, 0x2625242722212023, \
|
||||||
0x1E1D1C1F1A19181B, 0x1615141712111013, \
|
0x1E1D1C1F1A19181B, 0x1615141712111013, \
|
||||||
0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
|
0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Rotate elements from 2 512 bit vectors in place, source arguments
|
// Rotate elements from 2 512 bit vectors in place, source arguments
|
||||||
|
@@ -62,7 +62,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
|
|||||||
|
|
||||||
make clean || echo clean
|
make clean || echo clean
|
||||||
rm -f config.status
|
rm -f config.status
|
||||||
# GCC 9 doesn't include AES in core-avx2
|
# GCC 9 doesn't include AES in -march=core-avx2
|
||||||
CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS
|
CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS
|
||||||
make -j 16
|
make -j 16
|
||||||
strip -s cpuminer.exe
|
strip -s cpuminer.exe
|
||||||
@@ -70,7 +70,8 @@ mv cpuminer.exe release/cpuminer-avx2.exe
|
|||||||
|
|
||||||
make clean || echo clean
|
make clean || echo clean
|
||||||
rm -f config.status
|
rm -f config.status
|
||||||
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS
|
# -march=corei7-avx still includes aes, but just in case
|
||||||
|
CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure $CONFIGURE_ARGS
|
||||||
make -j 16
|
make -j 16
|
||||||
strip -s cpuminer.exe
|
strip -s cpuminer.exe
|
||||||
mv cpuminer.exe release/cpuminer-avx.exe
|
mv cpuminer.exe release/cpuminer-avx.exe
|
||||||
|
Reference in New Issue
Block a user