mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.10.2
This commit is contained in:
12
README.txt
12
README.txt
@@ -23,12 +23,20 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
|
||||
these CPUs. Some algos may crash the miner with an invalid instruction.
|
||||
Users are recommended to use an unoptimized miner such as cpuminer-multi.
|
||||
|
||||
More information for Intel and AMD CPU architectures and their features
|
||||
can be found on Wikipedia.
|
||||
|
||||
https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
|
||||
|
||||
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
|
||||
|
||||
|
||||
Exe name Compile flags Arch name
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandy-Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell, Sky-Kaby-Coffeelake
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell, Skylake, Coffeelake
|
||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
||||
cpuminer-zen "-march=znver1" AMD Ryzen, Threadripper
|
||||
|
||||
|
@@ -25,19 +25,23 @@ Requirements
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
|
||||
64 bit Linux or Windows operating system. Apple and Android are not supported.
|
||||
FreeBSD YMMV.
|
||||
64 bit Linux or Windows operating system. Apple, Android and Rpi are
|
||||
not supported. FreeBSD YMMV.
|
||||
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.10.2
|
||||
|
||||
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
|
||||
Fixed c11 AVX2 invalid shares.
|
||||
|
||||
v3.10.1
|
||||
|
||||
AVX512 for blake2b, nist5, quark, tribus.
|
||||
|
||||
More broken lane fixes.
|
||||
|
||||
Fixed buffer overflow in skein AVX512.
|
||||
More broken lane fixes, fixed buffer overflow in skein AVX512, fixed
|
||||
quark invalid shares AVX2.
|
||||
|
||||
Only the highest ranking feature in a class is listed at startup, lower ranking
|
||||
features are available but no longer listed.
|
||||
|
@@ -127,7 +127,7 @@ typedef struct {
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init( void *cc );
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
#define blake512_4way blake512_4way_update
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
@@ -136,6 +136,37 @@ void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//Blake-256 16 way
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_16way_small_context __attribute__ ((aligned (128)));
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_16way_small_context blake256_16way_context;
|
||||
void blake256_16way_init(void *cc);
|
||||
void blake256_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close(void *cc, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_16way_small_context blake256r14_16way_context;
|
||||
void blake256r14_16way_init(void *cc);
|
||||
void blake256r14_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_16way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_16way_small_context blake256r8_16way_context;
|
||||
void blake256r8_16way_init(void *cc);
|
||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_16way_close(void *cc, void *dst);
|
||||
|
||||
|
||||
// Blake-512 8 way
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
@@ -146,7 +177,7 @@ typedef struct {
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( void *cc );
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
|
@@ -680,6 +680,144 @@ do { \
|
||||
} while (0)
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blaske-256 16 way AVX512
|
||||
|
||||
#define GS_16WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
do { \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
|
||||
_mm512_xor_si512( _mm512_set1_epi32( c1 ), m0 ) ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 12 ); \
|
||||
a = _mm512_add_epi32( _mm512_add_epi32( a, b ), \
|
||||
_mm512_xor_si512( _mm512_set1_epi32( c0 ), m1 ) ); \
|
||||
d = mm512_ror_32( _mm512_xor_si512( d, a ), 8 ); \
|
||||
c = _mm512_add_epi32( c, d ); \
|
||||
b = mm512_ror_32( _mm512_xor_si512( b, c ), 7 ); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_S_16WAY(r) do { \
|
||||
GS_16WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS_16WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS_16WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS_16WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS_16WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS_16WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS_16WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS_16WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#define DECL_STATE32_16WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
sph_u32 T0, T1;
|
||||
|
||||
#define READ_STATE32_16WAY(state) \
|
||||
do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
H2 = (state)->H[2]; \
|
||||
H3 = (state)->H[3]; \
|
||||
H4 = (state)->H[4]; \
|
||||
H5 = (state)->H[5]; \
|
||||
H6 = (state)->H[6]; \
|
||||
H7 = (state)->H[7]; \
|
||||
T0 = (state)->T0; \
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE32_16WAY(state) \
|
||||
do { \
|
||||
(state)->H[0] = H0; \
|
||||
(state)->H[1] = H1; \
|
||||
(state)->H[2] = H2; \
|
||||
(state)->H[3] = H3; \
|
||||
(state)->H[4] = H4; \
|
||||
(state)->H[5] = H5; \
|
||||
(state)->H[6] = H6; \
|
||||
(state)->H[7] = H7; \
|
||||
(state)->T0 = T0; \
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#define COMPRESS32_16WAY( rounds ) \
|
||||
do { \
|
||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
__m512i shuf_bswap32; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = m512_const1_64( 0x85A308D385A308D3 ); \
|
||||
VA = m512_const1_64( 0x13198A2E13198A2E ); \
|
||||
VB = m512_const1_64( 0x0370734403707344 ); \
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
|
||||
m512_const1_64( 0xA4093822A4093822 ) ); \
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi32( T0 ),\
|
||||
m512_const1_64( 0x299F31D0299F31D0 ) ); \
|
||||
VE = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
|
||||
m512_const1_64( 0x082EFA98082EFA98 ) ); \
|
||||
VF = _mm512_xor_si512( _mm512_set1_epi32( T1 ), \
|
||||
m512_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
|
||||
shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \
|
||||
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
|
||||
M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
|
||||
M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
|
||||
M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
|
||||
M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
|
||||
M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
|
||||
M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
|
||||
M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
|
||||
M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
|
||||
MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
|
||||
MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
|
||||
MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
|
||||
MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
|
||||
ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
|
||||
MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
|
||||
ROUND_S_16WAY(0); \
|
||||
ROUND_S_16WAY(1); \
|
||||
ROUND_S_16WAY(2); \
|
||||
ROUND_S_16WAY(3); \
|
||||
ROUND_S_16WAY(4); \
|
||||
ROUND_S_16WAY(5); \
|
||||
ROUND_S_16WAY(6); \
|
||||
ROUND_S_16WAY(7); \
|
||||
if (rounds == 14) \
|
||||
{ \
|
||||
ROUND_S_16WAY(8); \
|
||||
ROUND_S_16WAY(9); \
|
||||
ROUND_S_16WAY(0); \
|
||||
ROUND_S_16WAY(1); \
|
||||
ROUND_S_16WAY(2); \
|
||||
ROUND_S_16WAY(3); \
|
||||
} \
|
||||
H0 = _mm512_xor_si512( _mm512_xor_si512( V8, V0 ), H0 ); \
|
||||
H1 = _mm512_xor_si512( _mm512_xor_si512( V9, V1 ), H1 ); \
|
||||
H2 = _mm512_xor_si512( _mm512_xor_si512( VA, V2 ), H2 ); \
|
||||
H3 = _mm512_xor_si512( _mm512_xor_si512( VB, V3 ), H3 ); \
|
||||
H4 = _mm512_xor_si512( _mm512_xor_si512( VC, V4 ), H4 ); \
|
||||
H5 = _mm512_xor_si512( _mm512_xor_si512( VD, V5 ), H5 ); \
|
||||
H6 = _mm512_xor_si512( _mm512_xor_si512( VE, V6 ), H6 ); \
|
||||
H7 = _mm512_xor_si512( _mm512_xor_si512( VF, V7 ), H7 ); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
// Blake-256 4 way
|
||||
@@ -916,6 +1054,179 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//Blake-256 16 way AVX512
|
||||
|
||||
static void
|
||||
blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
{
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );
|
||||
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
|
||||
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
|
||||
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
|
||||
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
|
||||
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
sc->rounds = rounds;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
size_t ptr;
|
||||
const int buf_size = 64; // number of elements, sizeof/4
|
||||
DECL_STATE32_16WAY
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < buf_size - ptr )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>2), vdata, len>>2 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
READ_STATE32_16WAY(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>2), vdata, clen>>2 );
|
||||
ptr += clen;
|
||||
vdata += (clen>>2);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_16WAY( sc->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE32_16WAY(sc);
|
||||
sc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFE00UL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
|
||||
if ( ptr <= 52 )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = _mm512_or_si512( buf[52>>2],
|
||||
m512_const1_64( 0x0100000001000000ULL ) );
|
||||
buf[+56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
|
||||
buf[+60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
|
||||
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_512( buf, 56>>2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
|
||||
buf[56>>2] = mm512_bswap_32( _mm512_set1_epi32( th ) );
|
||||
buf[60>>2] = mm512_bswap_32( _mm512_set1_epi32( tl ) );
|
||||
blake32_16way( sc, buf, 64 );
|
||||
}
|
||||
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256_16way_close_update(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void blake256r14_16way_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_16way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_16way_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
void blake256r8_16way_init(void *cc)
|
||||
{
|
||||
blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_16way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_16way(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_16way_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_16way_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
// default 14 rounds, backward copatibility
|
||||
|
@@ -42,20 +42,13 @@
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
|
||||
#define SPH_SMALL_FOOTPRINT_BLAKE 1
|
||||
#endif
|
||||
|
||||
#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
|
||||
#define SPH_COMPACT_BLAKE_64 1
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
// Blake-512
|
||||
|
||||
// Blake-512 common
|
||||
|
||||
/*
|
||||
static const sph_u64 IV512[8] = {
|
||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||
@@ -65,10 +58,6 @@ static const sph_u64 IV512[8] = {
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
|
||||
|
||||
// Blake-256 4 & 8 way, Blake-512 4 way
|
||||
|
||||
static const unsigned sigma[16][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
@@ -88,7 +77,17 @@ static const unsigned sigma[16][16] = {
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
#endif
|
||||
static const sph_u64 CB[16] = {
|
||||
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
||||
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
||||
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
||||
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
||||
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
||||
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
||||
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
||||
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
||||
|
||||
*/
|
||||
|
||||
#define Z00 0
|
||||
#define Z01 1
|
||||
@@ -285,23 +284,6 @@ static const unsigned sigma[16][16] = {
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
/*
|
||||
#if SPH_COMPACT_BLAKE_64
|
||||
// not used
|
||||
static const sph_u64 CB[16] = {
|
||||
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
||||
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
||||
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
||||
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
||||
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
||||
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
||||
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
||||
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
||||
};
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
#define READ_STATE64(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
H1 = (state)->H[1]; \
|
||||
@@ -338,7 +320,7 @@ static const sph_u64 CB[16] = {
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Blake-512 8 way
|
||||
// Blake-512 8 way AVX512
|
||||
|
||||
#define GB_8WAY(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = _mm512_add_epi64( _mm512_add_epi64( _mm512_xor_si512( \
|
||||
@@ -364,7 +346,6 @@ static const sph_u64 CB[16] = {
|
||||
GB_8WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define DECL_STATE64_8WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m512i S0, S1, S2, S3; \
|
||||
@@ -443,9 +424,7 @@ static const sph_u64 CB[16] = {
|
||||
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
blake64_8way_init( blake_8way_big_context *sc, const sph_u64 *iv,
|
||||
const sph_u64 *salt )
|
||||
void blake512_8way_init( blake_8way_big_context *sc )
|
||||
{
|
||||
__m512i zero = m512_zero;
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
@@ -511,20 +490,20 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_8way_close( blake_8way_big_context *sc,
|
||||
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
|
||||
blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
{
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
uint64_t z, zz;
|
||||
// uint64_t z, zz;
|
||||
sph_u64 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>3] = _mm512_set1_epi64( zz );
|
||||
// z = 0x80 >> n;
|
||||
// zz = ((ub & -z) | z) & 0xFF;
|
||||
// buf[ptr>>3] = _mm512_set1_epi64( zz );
|
||||
buf[ptr>>3] = m512_const1_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
@@ -544,11 +523,10 @@ blake64_8way_close( blake_8way_big_context *sc,
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[(104>>3)] = _mm512_or_si512( buf[(104>>3)],
|
||||
buf[104>>3] = _mm512_or_si512( buf[104>>3],
|
||||
m512_const1_64( 0x0100000000000000ULL ) );
|
||||
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
|
||||
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
buf[112>>3] = m512_const1_64( bswap_64( th ) );
|
||||
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
@@ -560,22 +538,15 @@ blake64_8way_close( blake_8way_big_context *sc,
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_512( buf, 112>>3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
|
||||
*(buf+(112>>3)) = _mm512_set1_epi64( bswap_64( th ) );
|
||||
*(buf+(120>>3)) = _mm512_set1_epi64( bswap_64( tl ) );
|
||||
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = m512_const1_64( bswap_64( th ) );
|
||||
buf[120>>3] = m512_const1_64( bswap_64( tl ) );
|
||||
|
||||
blake64_8way( sc, buf, 128 );
|
||||
}
|
||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_init(void *cc)
|
||||
{
|
||||
blake64_8way_init(cc, IV512, salt_zero_big);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
@@ -591,7 +562,7 @@ blake512_8way_close(void *cc, void *dst)
|
||||
void
|
||||
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_8way_close(cc, ub, n, dst, 8);
|
||||
blake64_8way_close(cc, dst);
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
@@ -698,11 +669,8 @@ blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
H7 = mm256_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
//static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
||||
const sph_u64 *salt )
|
||||
void blake512_4way_init( blake_4way_big_context *sc )
|
||||
{
|
||||
__m256i zero = m256_zero;
|
||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
@@ -713,12 +681,10 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
|
||||
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m256i( sc->S, 0 ) = zero;
|
||||
casti_m256i( sc->S, 1 ) = zero;
|
||||
casti_m256i( sc->S, 2 ) = zero;
|
||||
casti_m256i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
@@ -768,20 +734,16 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_4way_close( blake_4way_big_context *sc,
|
||||
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
|
||||
blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
{
|
||||
__m256i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
uint64_t z, zz;
|
||||
sph_u64 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
z = 0x80 >> n;
|
||||
zz = ((ub & -z) | z) & 0xFF;
|
||||
buf[ptr>>3] = _mm256_set1_epi64x( zz );
|
||||
buf[ptr>>3] = m256_const1_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
@@ -798,40 +760,41 @@ blake64_4way_close( blake_4way_big_context *sc,
|
||||
{
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
}
|
||||
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
|
||||
buf[104>>3] = _mm256_or_si256( buf[104>>3],
|
||||
m256_const1_64( 0x0100000000000000ULL ) );
|
||||
*(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
*(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
|
||||
buf[112>>3] = m256_const1_64( bswap_64( th ) );
|
||||
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_256( buf, 112>>3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
|
||||
*(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
*(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );
|
||||
buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = m256_const1_64( bswap_64( th ) );
|
||||
buf[120>>3] = m256_const1_64( bswap_64( tl ) );
|
||||
|
||||
blake64_4way( sc, buf, 128 );
|
||||
}
|
||||
mm256_block_bswap_64( (__m256i*)dst, sc->H );
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
blake512_4way_init(void *cc)
|
||||
{
|
||||
blake64_4way_init(cc, IV512, salt_zero_big);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
blake512_4way_update(void *cc, const void *data, size_t len)
|
||||
@@ -842,15 +805,18 @@ blake512_4way_update(void *cc, const void *data, size_t len)
|
||||
void
|
||||
blake512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
blake512_4way_addbits_and_close(cc, 0, 0, dst);
|
||||
blake64_4way_close( cc, dst );
|
||||
|
||||
// blake512_4way_addbits_and_close(cc, dst);
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -64,7 +64,8 @@ typedef bmw_4way_small_context bmw256_4way_context;
|
||||
|
||||
void bmw256_4way_init( bmw256_4way_context *ctx );
|
||||
|
||||
void bmw256_4way(void *cc, const void *data, size_t len);
|
||||
void bmw256_4way_update(void *cc, const void *data, size_t len);
|
||||
#define bmw256_4way bmw256_4way_update
|
||||
|
||||
void bmw256_4way_close(void *cc, void *dst);
|
||||
|
||||
@@ -87,11 +88,33 @@ typedef struct {
|
||||
typedef bmw_8way_small_context bmw256_8way_context;
|
||||
|
||||
void bmw256_8way_init( bmw256_8way_context *ctx );
|
||||
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
|
||||
void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
#define bmw256_8way bmw256_8way_update
|
||||
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// BMW-256 16 way 32
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
} bmw_16way_small_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef bmw_16way_small_context bmw256_16way_context;
|
||||
|
||||
void bmw256_16way_init( bmw256_16way_context *ctx );
|
||||
void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
|
||||
size_t len );
|
||||
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
|
@@ -564,7 +564,7 @@ bmw256_4way_init(void *cc)
|
||||
*/
|
||||
|
||||
void
|
||||
bmw256_4way(void *cc, const void *data, size_t len)
|
||||
bmw256_4way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
bmw32_4way(cc, data, len);
|
||||
}
|
||||
@@ -1014,7 +1014,8 @@ void bmw256_8way_init( bmw256_8way_context *ctx )
|
||||
ctx->bit_count = 0;
|
||||
}
|
||||
|
||||
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
|
||||
void bmw256_8way_update( bmw256_8way_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buf;
|
||||
@@ -1092,6 +1093,513 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
|
||||
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// BMW-256 16 way 32
|
||||
|
||||
|
||||
#define s16s0(x) \
|
||||
mm512_xor4( _mm512_srli_epi32( (x), 1), \
|
||||
_mm512_slli_epi32( (x), 3), \
|
||||
mm512_rol_32( (x), 4), \
|
||||
mm512_rol_32( (x), 19) )
|
||||
|
||||
#define s16s1(x) \
|
||||
mm512_xor4( _mm512_srli_epi32( (x), 1), \
|
||||
_mm512_slli_epi32( (x), 2), \
|
||||
mm512_rol_32( (x), 8), \
|
||||
mm512_rol_32( (x), 23) )
|
||||
|
||||
#define s16s2(x) \
|
||||
mm512_xor4( _mm512_srli_epi32( (x), 2), \
|
||||
_mm512_slli_epi32( (x), 1), \
|
||||
mm512_rol_32( (x), 12), \
|
||||
mm512_rol_32( (x), 25) )
|
||||
|
||||
#define s16s3(x) \
|
||||
mm512_xor4( _mm512_srli_epi32( (x), 2), \
|
||||
_mm512_slli_epi32( (x), 2), \
|
||||
mm512_rol_32( (x), 15), \
|
||||
mm512_rol_32( (x), 29) )
|
||||
|
||||
#define s16s4(x) \
|
||||
_mm512_xor_si512( (x), _mm512_srli_epi32( (x), 1 ) )
|
||||
|
||||
#define s16s5(x) \
|
||||
_mm512_xor_si512( (x), _mm512_srli_epi32( (x), 2 ) )
|
||||
|
||||
#define r16s1(x) mm512_rol_32( x, 3 )
|
||||
#define r16s2(x) mm512_rol_32( x, 7 )
|
||||
#define r16s3(x) mm512_rol_32( x, 13 )
|
||||
#define r16s4(x) mm512_rol_32( x, 16 )
|
||||
#define r16s5(x) mm512_rol_32( x, 19 )
|
||||
#define r16s6(x) mm512_rol_32( x, 23 )
|
||||
#define r16s7(x) mm512_rol_32( x, 27 )
|
||||
|
||||
#define mm512_rol_off_32( M, j, off ) \
|
||||
mm512_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_s16( M, H, j ) \
|
||||
_mm512_xor_si512( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_sub_epi32( _mm512_add_epi32( mm512_rol_off_32( M, j, 0 ), \
|
||||
mm512_rol_off_32( M, j, 3 ) ), \
|
||||
mm512_rol_off_32( M, j, 10 ) ), \
|
||||
_mm512_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
#define expand1s16( qt, M, H, i ) \
|
||||
_mm512_add_epi32( add_elt_s16( M, H, (i)-16 ), \
|
||||
mm512_add4_32( mm512_add4_32( s16s1( qt[ (i)-16 ] ), \
|
||||
s16s2( qt[ (i)-15 ] ), \
|
||||
s16s3( qt[ (i)-14 ] ), \
|
||||
s16s0( qt[ (i)-13 ] ) ), \
|
||||
mm512_add4_32( s16s1( qt[ (i)-12 ] ), \
|
||||
s16s2( qt[ (i)-11 ] ), \
|
||||
s16s3( qt[ (i)-10 ] ), \
|
||||
s16s0( qt[ (i)- 9 ] ) ), \
|
||||
mm512_add4_32( s16s1( qt[ (i)- 8 ] ), \
|
||||
s16s2( qt[ (i)- 7 ] ), \
|
||||
s16s3( qt[ (i)- 6 ] ), \
|
||||
s16s0( qt[ (i)- 5 ] ) ), \
|
||||
mm512_add4_32( s16s1( qt[ (i)- 4 ] ), \
|
||||
s16s2( qt[ (i)- 3 ] ), \
|
||||
s16s3( qt[ (i)- 2 ] ), \
|
||||
s16s0( qt[ (i)- 1 ] ) ) ) )
|
||||
|
||||
#define expand2s16( qt, M, H, i) \
|
||||
_mm512_add_epi32( add_elt_s16( M, H, (i)-16 ), \
|
||||
mm512_add4_32( mm512_add4_32( qt[ (i)-16 ], \
|
||||
r16s1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], \
|
||||
r16s2( qt[ (i)-13 ] ) ), \
|
||||
mm512_add4_32( qt[ (i)-12 ], \
|
||||
r16s3( qt[ (i)-11 ] ), \
|
||||
qt[ (i)-10 ], \
|
||||
r16s4( qt[ (i)- 9 ] ) ), \
|
||||
mm512_add4_32( qt[ (i)- 8 ], \
|
||||
r16s5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], \
|
||||
r16s6( qt[ (i)- 5 ] ) ), \
|
||||
mm512_add4_32( qt[ (i)- 4 ], \
|
||||
r16s7( qt[ (i)- 3 ] ), \
|
||||
s16s4( qt[ (i)- 2 ] ), \
|
||||
s16s5( qt[ (i)- 1 ] ) ) ) )
|
||||
|
||||
|
||||
#define W16s0 \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
|
||||
#define W16s1 \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 6], H[ 6] ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[14], H[14] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W16s2 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W16s3 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
|
||||
#define W16s4 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ), \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
|
||||
#define W16s5 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W16s6 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 4], H[ 4] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
|
||||
#define W16s7 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
|
||||
#define W16s8 \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[13], H[13] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W16s9 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 0], H[ 0] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[14], H[14] ) ) )
|
||||
|
||||
#define W16s10 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 1], H[ 1] ) ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 7], H[ 7] ), \
|
||||
_mm512_xor_si512( M[15], H[15] ) ) )
|
||||
|
||||
#define W16s11 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 8], H[ 8] ), \
|
||||
_mm512_xor_si512( M[ 0], H[ 0] ) ), \
|
||||
_mm512_xor_si512( M[ 2], H[ 2] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 5], H[ 5] ), \
|
||||
_mm512_xor_si512( M[ 9], H[ 9] ) ) )
|
||||
|
||||
#define W16s12 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[ 1], H[ 1] ), \
|
||||
_mm512_xor_si512( M[ 3], H[ 3] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[10], H[10] ) ) )
|
||||
|
||||
#define W16s13 \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[ 2], H[ 2] ), \
|
||||
_mm512_xor_si512( M[ 4], H[ 4] ) ), \
|
||||
_mm512_xor_si512( M[ 7], H[ 7] ) ), \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[10], H[10] ), \
|
||||
_mm512_xor_si512( M[11], H[11] ) ) )
|
||||
|
||||
#define W16s14 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 3], H[ 3] ), \
|
||||
_mm512_xor_si512( M[ 5], H[ 5] ) ), \
|
||||
_mm512_xor_si512( M[ 8], H[ 8] ) ), \
|
||||
_mm512_add_epi32( _mm512_xor_si512( M[11], H[11] ), \
|
||||
_mm512_xor_si512( M[12], H[12] ) ) )
|
||||
|
||||
#define W16s15 \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[12], H[12] ), \
|
||||
_mm512_xor_si512( M[ 4], H[4] ) ), \
|
||||
_mm512_xor_si512( M[ 6], H[ 6] ) ), \
|
||||
_mm512_sub_epi32( _mm512_xor_si512( M[ 9], H[ 9] ), \
|
||||
_mm512_xor_si512( M[13], H[13] ) ) )
|
||||
|
||||
void compress_small_16way( const __m512i *M, const __m512i H[16],
|
||||
__m512i dH[16] )
|
||||
{
|
||||
__m512i qt[32], xl, xh;
|
||||
|
||||
qt[ 0] = _mm512_add_epi32( s16s0( W16s0 ), H[ 1] );
|
||||
qt[ 1] = _mm512_add_epi32( s16s1( W16s1 ), H[ 2] );
|
||||
qt[ 2] = _mm512_add_epi32( s16s2( W16s2 ), H[ 3] );
|
||||
qt[ 3] = _mm512_add_epi32( s16s3( W16s3 ), H[ 4] );
|
||||
qt[ 4] = _mm512_add_epi32( s16s4( W16s4 ), H[ 5] );
|
||||
qt[ 5] = _mm512_add_epi32( s16s0( W16s5 ), H[ 6] );
|
||||
qt[ 6] = _mm512_add_epi32( s16s1( W16s6 ), H[ 7] );
|
||||
qt[ 7] = _mm512_add_epi32( s16s2( W16s7 ), H[ 8] );
|
||||
qt[ 8] = _mm512_add_epi32( s16s3( W16s8 ), H[ 9] );
|
||||
qt[ 9] = _mm512_add_epi32( s16s4( W16s9 ), H[10] );
|
||||
qt[10] = _mm512_add_epi32( s16s0( W16s10), H[11] );
|
||||
qt[11] = _mm512_add_epi32( s16s1( W16s11), H[12] );
|
||||
qt[12] = _mm512_add_epi32( s16s2( W16s12), H[13] );
|
||||
qt[13] = _mm512_add_epi32( s16s3( W16s13), H[14] );
|
||||
qt[14] = _mm512_add_epi32( s16s4( W16s14), H[15] );
|
||||
qt[15] = _mm512_add_epi32( s16s0( W16s15), H[ 0] );
|
||||
qt[16] = expand1s16( qt, M, H, 16 );
|
||||
qt[17] = expand1s16( qt, M, H, 17 );
|
||||
qt[18] = expand2s16( qt, M, H, 18 );
|
||||
qt[19] = expand2s16( qt, M, H, 19 );
|
||||
qt[20] = expand2s16( qt, M, H, 20 );
|
||||
qt[21] = expand2s16( qt, M, H, 21 );
|
||||
qt[22] = expand2s16( qt, M, H, 22 );
|
||||
qt[23] = expand2s16( qt, M, H, 23 );
|
||||
qt[24] = expand2s16( qt, M, H, 24 );
|
||||
qt[25] = expand2s16( qt, M, H, 25 );
|
||||
qt[26] = expand2s16( qt, M, H, 26 );
|
||||
qt[27] = expand2s16( qt, M, H, 27 );
|
||||
qt[28] = expand2s16( qt, M, H, 28 );
|
||||
qt[29] = expand2s16( qt, M, H, 29 );
|
||||
qt[30] = expand2s16( qt, M, H, 30 );
|
||||
qt[31] = expand2s16( qt, M, H, 31 );
|
||||
|
||||
xl = _mm512_xor_si512(
|
||||
mm512_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm512_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm512_xor_si512( xl, _mm512_xor_si512(
|
||||
mm512_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm512_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_slli_epi32( xh, sl ), \
|
||||
_mm512_srli_epi32( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm512_add_epi32( \
|
||||
_mm512_xor_si512( M[m], \
|
||||
_mm512_xor_si512( _mm512_srli_epi32( xh, sl ), \
|
||||
_mm512_slli_epi32( qt[a], sr ) ) ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm512_add_epi32( _mm512_add_epi32( \
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_slli_epi32( xl, sl ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm512_add_epi32( _mm512_add_epi32( \
|
||||
mm512_rol_32( dH[h], rl ), \
|
||||
_mm512_xor_si512( _mm512_xor_si512( xh, qt[a] ), M[m] )), \
|
||||
_mm512_xor_si512( _mm512_srli_epi32( xl, sr ), \
|
||||
_mm512_xor_si512( qt[b], qt[c] ) ) );
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
|
||||
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
|
||||
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
|
||||
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
|
||||
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
|
||||
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
|
||||
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
|
||||
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
|
||||
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
|
||||
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
|
||||
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
|
||||
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
|
||||
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
|
||||
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
|
||||
|
||||
#undef DH1L
|
||||
#undef DH1R
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
|
||||
}
|
||||
|
||||
static const __m512i final_s16[16] =
|
||||
{
|
||||
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
|
||||
0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
|
||||
0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0,
|
||||
0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
|
||||
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
|
||||
0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
|
||||
0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1,
|
||||
0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
|
||||
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
|
||||
0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
|
||||
0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2,
|
||||
0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
|
||||
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
|
||||
0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
|
||||
0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3,
|
||||
0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
|
||||
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
|
||||
0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
|
||||
0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4,
|
||||
0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
|
||||
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
|
||||
0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
|
||||
0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5,
|
||||
0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
|
||||
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
|
||||
0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
|
||||
0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6,
|
||||
0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
|
||||
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
|
||||
0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
|
||||
0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7,
|
||||
0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
|
||||
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
|
||||
0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
|
||||
0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8,
|
||||
0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
|
||||
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
|
||||
0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
|
||||
0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9,
|
||||
0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
|
||||
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
|
||||
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
|
||||
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
|
||||
0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
|
||||
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
|
||||
0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
|
||||
0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab,
|
||||
0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
|
||||
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
|
||||
0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
|
||||
0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac,
|
||||
0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
|
||||
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
|
||||
0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
|
||||
0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad,
|
||||
0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
|
||||
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
|
||||
0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
|
||||
0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae,
|
||||
0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
|
||||
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
|
||||
0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
|
||||
0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf,
|
||||
0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
|
||||
};
|
||||
|
||||
|
||||
void bmw256_16way_init( bmw256_16way_context *ctx )
|
||||
{
|
||||
ctx->H[ 0] = m512_const1_64( 0x4041424340414243 );
|
||||
ctx->H[ 1] = m512_const1_64( 0x4445464744454647 );
|
||||
ctx->H[ 2] = m512_const1_64( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = m512_const1_64( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = m512_const1_64( 0x5051525350515253 );
|
||||
ctx->H[ 5] = m512_const1_64( 0x5455565754555657 );
|
||||
ctx->H[ 6] = m512_const1_64( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = m512_const1_64( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = m512_const1_64( 0x6061626360616263 );
|
||||
ctx->H[ 9] = m512_const1_64( 0x6465666764656667 );
|
||||
ctx->H[10] = m512_const1_64( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = m512_const1_64( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = m512_const1_64( 0x7071727370717273 );
|
||||
ctx->H[13] = m512_const1_64( 0x7475767774757677 );
|
||||
ctx->H[14] = m512_const1_64( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = m512_const1_64( 0x7C7D7E7F7C7D7E7F );
|
||||
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
}
|
||||
|
||||
void bmw256_16way_update( bmw256_16way_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
__m512i *buf;
|
||||
__m512i htmp[16];
|
||||
__m512i *h1, *h2;
|
||||
size_t ptr;
|
||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||
|
||||
ctx->bit_count += len << 3;
|
||||
buf = ctx->buf;
|
||||
ptr = ctx->ptr;
|
||||
h1 = ctx->H;
|
||||
h2 = htmp;
|
||||
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>2), vdata, clen >> 2 );
|
||||
vdata = vdata + (clen>>2);
|
||||
len -= clen;
|
||||
ptr += clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
__m512i *ht;
|
||||
compress_small_16way( buf, h1, h2 );
|
||||
ht = h1;
|
||||
h1 = h2;
|
||||
h2 = ht;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
ctx->ptr = ptr;
|
||||
|
||||
if ( h1 != ctx->H )
|
||||
memcpy_512( ctx->H, h1, 16 );
|
||||
}
|
||||
|
||||
void bmw256_16way_close( bmw256_16way_context *ctx, void *dst )
|
||||
{
|
||||
__m512i *buf;
|
||||
__m512i h1[16], h2[16], *h;
|
||||
size_t ptr, u, v;
|
||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||
|
||||
buf = ctx->buf;
|
||||
ptr = ctx->ptr;
|
||||
buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
h = ctx->H;
|
||||
|
||||
if ( ptr > (buf_size - 4) )
|
||||
{
|
||||
memset_zero_512( buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
compress_small_16way( buf, h, h1 );
|
||||
ptr = 0;
|
||||
h = h1;
|
||||
}
|
||||
memset_zero_512( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
|
||||
buf[ (buf_size - 8) >> 2 ] = _mm512_set1_epi32( ctx->bit_count );
|
||||
buf[ (buf_size - 4) >> 2 ] = m512_zero;
|
||||
|
||||
compress_small_16way( buf, h, h2 );
|
||||
|
||||
for ( u = 0; u < 16; u ++ )
|
||||
buf[u] = h2[u];
|
||||
|
||||
compress_small_16way( buf, final_s16, h1 );
|
||||
for (u = 0, v = 16 - 8; u < 8; u ++, v ++)
|
||||
casti_m512i(dst,u) = h1[v];
|
||||
}
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -18,16 +18,17 @@ void bmw512hash_8way(void *state, const void *input)
|
||||
int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
// const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
@@ -39,7 +40,8 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
bmw512hash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
|
||||
// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
@@ -48,15 +50,14 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
n += 8;
|
||||
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart) );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#elif defined(BMW512_4WAY)
|
||||
|
||||
//#ifdef BMW512_4WAY
|
||||
@@ -72,16 +73,17 @@ void bmw512hash_4way(void *state, const void *input)
|
||||
int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[16*4] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
// const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
@@ -92,7 +94,8 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
bmw512hash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
|
||||
// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
@@ -103,9 +106,9 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
n += 4;
|
||||
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -58,8 +58,7 @@ static const sph_u64 IV512[] = {
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// BMW-512 2 way 64
|
||||
|
||||
// BMW-512 2 way 64
|
||||
|
||||
#define s2b0(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
|
||||
@@ -824,87 +823,57 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
dH[ 0] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[0],
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
|
||||
_mm256_srli_epi64( qt[16], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ) );
|
||||
dH[ 1] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[1],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
|
||||
_mm256_slli_epi64( qt[17], 8 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ) );
|
||||
dH[ 2] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[2],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
|
||||
_mm256_slli_epi64( qt[18], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ) );
|
||||
dH[ 3] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[3],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
|
||||
_mm256_slli_epi64( qt[19], 5 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ) );
|
||||
dH[ 4] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[4],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
|
||||
_mm256_slli_epi64( qt[20], 0 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ) );
|
||||
dH[ 5] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[5],
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
|
||||
_mm256_srli_epi64( qt[21], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ) );
|
||||
dH[ 6] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[6],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
|
||||
_mm256_slli_epi64( qt[22], 6 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ) );
|
||||
dH[ 7] = _mm256_add_epi64(
|
||||
_mm256_xor_si256( M[7],
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
|
||||
_mm256_slli_epi64( qt[23], 2 ) ) ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ) );
|
||||
dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rol_64( dH[4], 9 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
|
||||
_mm256_xor_si256( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rol_64( dH[5], 10 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
|
||||
_mm256_xor_si256( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rol_64( dH[6], 11 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
|
||||
_mm256_xor_si256( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rol_64( dH[7], 12 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
|
||||
_mm256_xor_si256( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rol_64( dH[0], 13 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
|
||||
_mm256_xor_si256( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rol_64( dH[1], 14 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
|
||||
_mm256_xor_si256( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rol_64( dH[2], 15 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
|
||||
_mm256_xor_si256( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm256_add_epi64( _mm256_add_epi64(
|
||||
mm256_rol_64( dH[3], 16 ),
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
|
||||
_mm256_xor_si256( qt[22], qt[15] ) ) );
|
||||
}
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xh, sl ), \
|
||||
_mm256_srli_epi64( qt[a], sr ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
_mm256_add_epi64( \
|
||||
_mm256_xor_si256( M[m], \
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xh, sl ), \
|
||||
_mm256_slli_epi64( qt[a], sr ) ) ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
_mm256_add_epi64( _mm256_add_epi64( \
|
||||
mm256_rol_64( dH[h], rl ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||
_mm256_xor_si256( _mm256_slli_epi64( xl, sl ), \
|
||||
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
_mm256_add_epi64( _mm256_add_epi64( \
|
||||
mm256_rol_64( dH[h], rl ), \
|
||||
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
|
||||
_mm256_xor_si256( _mm256_srli_epi64( xl, sr ), \
|
||||
_mm256_xor_si256( qt[b], qt[c] ) ) );
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
|
||||
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
|
||||
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
|
||||
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
|
||||
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
|
||||
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
|
||||
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
|
||||
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
|
||||
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
|
||||
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
|
||||
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
|
||||
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
|
||||
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
|
||||
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
|
||||
|
||||
#undef DH1L
|
||||
#undef DH1R
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
}
|
||||
|
||||
static const __m256i final_b[16] =
|
||||
{
|
||||
|
@@ -28,6 +28,10 @@ static const uint64_t IV512[] =
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// 4 way 128 is handy to avoid reinterleaving in many algos.
|
||||
// If reinterleaving is necessary it may be more efficient to use
|
||||
// 2 way 256. The same transform code should work for both.
|
||||
|
||||
static void transform_4way( cube_4way_context *sp )
|
||||
{
|
||||
int r;
|
||||
@@ -201,6 +205,8 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// 2 way 128
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
|
@@ -1,203 +0,0 @@
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
#include "cube-hash-2way.h"
|
||||
|
||||
// 2x128
|
||||
|
||||
|
||||
// The result of hashing 10 rounds of initial data which consists of params
|
||||
// zero padded.
|
||||
static const uint64_t IV256[] =
|
||||
{
|
||||
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
|
||||
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
|
||||
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
|
||||
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
|
||||
};
|
||||
|
||||
static const uint64_t IV512[] =
|
||||
{
|
||||
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
|
||||
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
|
||||
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
|
||||
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
|
||||
};
|
||||
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
const int rounds = sp->rounds;
|
||||
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1;
|
||||
|
||||
x0 = _mm256_load_si256( (__m256i*)sp->h );
|
||||
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
|
||||
x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
|
||||
x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
|
||||
x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
|
||||
x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
|
||||
x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
|
||||
x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
|
||||
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x1;
|
||||
x0 = mm256_rol_32( x2, 7 );
|
||||
x1 = mm256_rol_32( x3, 7 );
|
||||
x2 = mm256_rol_32( y0, 7 );
|
||||
x3 = mm256_rol_32( y1, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap64_128( x4 );
|
||||
x5 = mm256_swap64_128( x5 );
|
||||
x6 = mm256_swap64_128( x6 );
|
||||
x7 = mm256_swap64_128( x7 );
|
||||
x4 = _mm256_add_epi32( x0, x4 );
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
y0 = x0;
|
||||
y1 = x2;
|
||||
x0 = mm256_rol_32( x1, 11 );
|
||||
x1 = mm256_rol_32( y0, 11 );
|
||||
x2 = mm256_rol_32( x3, 11 );
|
||||
x3 = mm256_rol_32( y1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
x3 = _mm256_xor_si256( x3, x7 );
|
||||
x4 = mm256_swap32_64( x4 );
|
||||
x5 = mm256_swap32_64( x5 );
|
||||
x6 = mm256_swap32_64( x6 );
|
||||
x7 = mm256_swap32_64( x7 );
|
||||
}
|
||||
|
||||
_mm256_store_si256( (__m256i*)sp->h, x0 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 1, x1 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 2, x2 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 3, x3 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 4, x4 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
|
||||
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
|
||||
|
||||
}
|
||||
|
||||
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
|
||||
int blockbytes )
|
||||
{
|
||||
__m256i *h = (__m256i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = blockbytes/16;
|
||||
sp->rounds = rounds;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_2way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_close( cube_2way_context *sp, void *output )
|
||||
{
|
||||
__m256i *hash = (__m256i*)output;
|
||||
int i;
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
__m256i *hash = (__m256i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_2way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,36 +0,0 @@
|
||||
#ifndef CUBE_HASH_2WAY_H__
|
||||
#define CUBE_HASH_2WAY_H__
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
// 2x128, 2 way parallel SSE2
|
||||
|
||||
struct _cube_2way_context
|
||||
{
|
||||
__m256i h[8];
|
||||
int hashlen; // __m128i
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
} __attribute__ ((aligned (64)));
|
||||
|
||||
typedef struct _cube_2way_context cube_2way_context;
|
||||
|
||||
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_2way_reinit( cube_2way_context *sp );
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_2way_close( cube_2way_context *sp, void *output );
|
||||
|
||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
|
||||
|
||||
#endif
|
||||
#endif
|
@@ -1,6 +1,7 @@
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#include "luffa-hash-2way.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
@@ -318,22 +319,6 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
||||
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
||||
|
||||
MULT24W( chainv[2], chainv[3], MASK );
|
||||
chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
|
||||
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
|
||||
|
||||
MULT24W( chainv[0], chainv[1], MASK );
|
||||
chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
|
||||
chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
|
||||
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
|
||||
chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
||||
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
|
||||
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
|
||||
@@ -345,14 +330,10 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
|
||||
// replace with ror
|
||||
chainv[3] = _mm512_or_si512( _mm512_slli_epi32( chainv[3], 1 ),
|
||||
_mm512_srli_epi32( chainv[3], 31 ) );
|
||||
chainv[5] = _mm512_or_si512( _mm512_slli_epi32( chainv[5], 2 ),
|
||||
_mm512_srli_epi32( chainv[5], 30 ) );
|
||||
chainv[7] = _mm512_or_si512( _mm512_slli_epi32( chainv[7], 3 ),
|
||||
_mm512_srli_epi32( chainv[7], 29 ) );
|
||||
chainv[9] = _mm512_or_si512( _mm512_slli_epi32( chainv[9], 4 ),
|
||||
_mm512_srli_epi32( chainv[9], 28 ) );
|
||||
chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
|
||||
chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
|
||||
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
|
||||
chainv[9] = _mm512_rol_epi32( chainv[9], 4 );
|
||||
|
||||
NMLTOM10244W( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
@@ -394,7 +375,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
|
||||
void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||
{
|
||||
uint32 hash[8*4] __attribute((aligned(128)));
|
||||
uint32_t hash[8*4] __attribute((aligned(128)));
|
||||
__m512i* chainv = state->chainv;
|
||||
__m512i t[2];
|
||||
__m512i zero[2];
|
||||
@@ -424,7 +405,7 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
|
||||
_mm512_store_si512( (__m512i*)&hash[8], t[1] );
|
||||
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
||||
|
||||
casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash, 0 ), shuff_bswap32 );
|
||||
@@ -448,7 +429,7 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
|
||||
t[1] = _mm512_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)&hash[0], t[0] );
|
||||
_mm512_store_si512( (__m512i*)&hash[8], t[1] );
|
||||
_mm512_store_si512( (__m512i*)&hash[16], t[1] );
|
||||
|
||||
casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
|
||||
casti_m512i( hash, 0 ), shuff_bswap32 );
|
||||
@@ -493,8 +474,8 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223,
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
|
||||
state-> rembytes = (int)len & 0x1F;
|
||||
|
||||
state->rembytes = (int)len & 0x1F;
|
||||
|
||||
// full blocks
|
||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||
@@ -578,8 +559,9 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
}
|
||||
|
||||
finalization512_4way( state, (uint32*)output );
|
||||
|
||||
if ( state->hashbitlen > 512 )
|
||||
finalization512_4way( state, (uint32*)( output+32 ) );
|
||||
finalization512_4way( state, (uint32*)( output+64 ) );
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -860,14 +842,10 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
|
||||
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
|
||||
_mm256_srli_epi32( chainv[3], 31 ) );
|
||||
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
|
||||
_mm256_srli_epi32( chainv[5], 30 ) );
|
||||
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
|
||||
_mm256_srli_epi32( chainv[7], 29 ) );
|
||||
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
|
||||
_mm256_srli_epi32( chainv[9], 28 ) );
|
||||
chainv[3] = mm256_rol_32( chainv[3], 1 );
|
||||
chainv[5] = mm256_rol_32( chainv[5], 2 );
|
||||
chainv[7] = mm256_rol_32( chainv[7], 3 );
|
||||
chainv[9] = mm256_rol_32( chainv[9], 4 );
|
||||
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
@@ -1093,6 +1071,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
|
||||
}
|
||||
|
||||
finalization512_2way( state, (uint32*)output );
|
||||
|
||||
if ( state->hashbitlen > 512 )
|
||||
finalization512_2way( state, (uint32*)( output+32 ) );
|
||||
|
||||
|
@@ -1,573 +0,0 @@
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#include "luffa-hash-2way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT(a,b,c0,c1)\
|
||||
a = _mm256_xor_si256(a,c0);\
|
||||
b = _mm256_xor_si256(b,c1);\
|
||||
|
||||
#define MULT2( a0, a1, mask ) \
|
||||
do { \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
|
||||
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
|
||||
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
||||
} while(0)
|
||||
|
||||
// confirm pointer arithmetic
|
||||
// ok but use array indexes
|
||||
#define STEP_PART(x,c0,c1,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT(*x, *(x+4), c0, c1);
|
||||
|
||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
||||
t = _mm256_load_si256(&a0);\
|
||||
a0 = _mm256_or_si256(a0,a1);\
|
||||
a2 = _mm256_xor_si256(a2,a3);\
|
||||
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
|
||||
a0 = _mm256_xor_si256(a0,a3);\
|
||||
a3 = _mm256_and_si256(a3,t);\
|
||||
a1 = _mm256_xor_si256(a1,a3);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a0);\
|
||||
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
|
||||
a2 = _mm256_xor_si256(a2,a1);\
|
||||
a1 = _mm256_or_si256(a1,a3);\
|
||||
t = _mm256_xor_si256(t,a1);\
|
||||
a3 = _mm256_xor_si256(a3,a2);\
|
||||
a2 = _mm256_and_si256(a2,a1);\
|
||||
a1 = _mm256_xor_si256(a1,a0);\
|
||||
a0 = _mm256_load_si256(&t);\
|
||||
|
||||
#define MIXWORD(a,b,t1,t2)\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,2);\
|
||||
t2 = _mm256_srli_epi32(a,30);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,14);\
|
||||
t2 = _mm256_srli_epi32(b,18);\
|
||||
b = _mm256_or_si256(t1,t2);\
|
||||
b = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(a,10);\
|
||||
t2 = _mm256_srli_epi32(a,22);\
|
||||
a = _mm256_or_si256(t1,t2);\
|
||||
a = _mm256_xor_si256(a,b);\
|
||||
t1 = _mm256_slli_epi32(b,1);\
|
||||
t2 = _mm256_srli_epi32(b,31);\
|
||||
b = _mm256_or_si256(t1,t2);
|
||||
|
||||
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm256_shuffle_epi32(a1,147);\
|
||||
t0 = _mm256_load_si256(&a1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm256_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm256_shuffle_epi32(t0,78);\
|
||||
a0 = _mm256_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm256_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm256_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm256_load_si256(&a1);\
|
||||
a0 = _mm256_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm256_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm256_shuffle_epi32(a1,57);\
|
||||
MIXWORD(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT(a0,a1,c0,c1);
|
||||
|
||||
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm256_load_si256(&r1);\
|
||||
q2 = _mm256_load_si256(&p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,216);\
|
||||
p2 = _mm256_shuffle_epi32(p2,216);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
s2 = _mm256_unpackhi_epi32(s2,r0);\
|
||||
q2 = _mm256_unpackhi_epi32(q2,p0);\
|
||||
s0 = _mm256_load_si256(&r2);\
|
||||
q0 = _mm256_load_si256(&p2);\
|
||||
r2 = _mm256_unpacklo_epi64(r2,r1);\
|
||||
p2 = _mm256_unpacklo_epi64(p2,p1);\
|
||||
s1 = _mm256_load_si256(&s0);\
|
||||
q1 = _mm256_load_si256(&q0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p1);\
|
||||
r2 = _mm256_shuffle_epi32(r2,225);\
|
||||
p2 = _mm256_shuffle_epi32(p2,225);\
|
||||
r0 = _mm256_load_si256(&s1);\
|
||||
p0 = _mm256_load_si256(&q1);\
|
||||
s0 = _mm256_shuffle_epi32(s0,225);\
|
||||
q0 = _mm256_shuffle_epi32(q0,225);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,s2);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,q2);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,s2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,q2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s3 = _mm256_load_si256(&r2);\
|
||||
q3 = _mm256_load_si256(&p2);\
|
||||
|
||||
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
|
||||
s0 = _mm256_load_si256(&r0);\
|
||||
q0 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r2);\
|
||||
q1 = _mm256_load_si256(&p2);\
|
||||
r0 = _mm256_unpackhi_epi32(r0,r1);\
|
||||
p0 = _mm256_unpackhi_epi32(p0,p1);\
|
||||
r2 = _mm256_unpackhi_epi32(r2,r3);\
|
||||
p2 = _mm256_unpackhi_epi32(p2,p3);\
|
||||
s0 = _mm256_unpacklo_epi32(s0,r1);\
|
||||
q0 = _mm256_unpacklo_epi32(q0,p1);\
|
||||
s1 = _mm256_unpacklo_epi32(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi32(q1,p3);\
|
||||
r1 = _mm256_load_si256(&r0);\
|
||||
p1 = _mm256_load_si256(&p0);\
|
||||
r0 = _mm256_unpackhi_epi64(r0,r2);\
|
||||
p0 = _mm256_unpackhi_epi64(p0,p2);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,s1);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,q1);\
|
||||
r1 = _mm256_unpacklo_epi64(r1,r2);\
|
||||
p1 = _mm256_unpacklo_epi64(p1,p2);\
|
||||
s2 = _mm256_load_si256(&r0);\
|
||||
q2 = _mm256_load_si256(&p0);\
|
||||
s1 = _mm256_load_si256(&r1);\
|
||||
q1 = _mm256_load_si256(&p1);\
|
||||
|
||||
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm256_load_si256(&r3);\
|
||||
q1 = _mm256_load_si256(&p3);\
|
||||
s3 = _mm256_load_si256(&r3);\
|
||||
q3 = _mm256_load_si256(&p3);\
|
||||
s1 = _mm256_unpackhi_epi32(s1,r2);\
|
||||
q1 = _mm256_unpackhi_epi32(q1,p2);\
|
||||
s3 = _mm256_unpacklo_epi32(s3,r2);\
|
||||
q3 = _mm256_unpacklo_epi32(q3,p2);\
|
||||
s0 = _mm256_load_si256(&s1);\
|
||||
q0 = _mm256_load_si256(&q1);\
|
||||
s2 = _mm256_load_si256(&s3);\
|
||||
q2 = _mm256_load_si256(&q3);\
|
||||
r3 = _mm256_load_si256(&r1);\
|
||||
p3 = _mm256_load_si256(&p1);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
r3 = _mm256_unpackhi_epi32(r3,r0);\
|
||||
p3 = _mm256_unpackhi_epi32(p3,p0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r3);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p3);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,p3);\
|
||||
s2 = _mm256_unpackhi_epi64(s2,r1);\
|
||||
q2 = _mm256_unpackhi_epi64(q2,p1);\
|
||||
s3 = _mm256_unpacklo_epi64(s3,r1);\
|
||||
q3 = _mm256_unpacklo_epi64(q3,p1);
|
||||
|
||||
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
|
||||
/* initial values of chaining variables */
|
||||
static const uint32 IV[40] __attribute((aligned(32))) = {
|
||||
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
|
||||
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
|
||||
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
|
||||
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
|
||||
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
|
||||
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
|
||||
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
|
||||
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
|
||||
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
|
||||
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
|
||||
};
|
||||
|
||||
/* Round Constants */
|
||||
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
|
||||
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
|
||||
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
|
||||
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
|
||||
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
|
||||
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
|
||||
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
|
||||
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
|
||||
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
|
||||
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
|
||||
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
|
||||
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
|
||||
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
|
||||
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
|
||||
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
|
||||
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
|
||||
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
|
||||
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
|
||||
0x00000000,0x00000000,0x00000000,0x5090d577,
|
||||
0x00000000,0x00000000,0x00000000,0xac11d7fa,
|
||||
0x00000000,0x00000000,0x00000000,0x2d1925ab,
|
||||
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
|
||||
0x00000000,0x00000000,0x00000000,0xb46496ac,
|
||||
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
|
||||
0x00000000,0x00000000,0x00000000,0xd1925ab0,
|
||||
0x00000000,0x00000000,0x00000000,0x78602649,
|
||||
0x00000000,0x00000000,0x00000000,0x29131ab6,
|
||||
0x00000000,0x00000000,0x00000000,0x8edae952,
|
||||
0x00000000,0x00000000,0x00000000,0x0fc053c3,
|
||||
0x00000000,0x00000000,0x00000000,0x3b6ba548,
|
||||
0x00000000,0x00000000,0x00000000,0x3f014f0c,
|
||||
0x00000000,0x00000000,0x00000000,0xedae9520,
|
||||
0x00000000,0x00000000,0x00000000,0xfc053c31
|
||||
};
|
||||
|
||||
|
||||
|
||||
/***************************************************/
|
||||
/* Round function */
|
||||
/* state: hash context */
|
||||
|
||||
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
{
|
||||
__m256i t0, t1;
|
||||
__m256i *chainv = state->chainv;
|
||||
__m256i msg0, msg1;
|
||||
__m256i tmp[2];
|
||||
__m256i x[8];
|
||||
const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
t0 = _mm256_xor_si256( t0, chainv[2] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[3] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[4] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[5] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[6] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[7] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[8] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[9] );
|
||||
|
||||
MULT2( t0, t1, MASK );
|
||||
|
||||
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
|
||||
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
|
||||
|
||||
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
|
||||
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], t0 );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], t1 );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], t0 );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], t1 );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], t0 );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], t1 );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
MULT2( chainv[0], chainv[1], MASK );
|
||||
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
|
||||
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
|
||||
|
||||
MULT2( chainv[2], chainv[3], MASK );
|
||||
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
|
||||
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
|
||||
|
||||
MULT2( chainv[4], chainv[5], MASK );
|
||||
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
|
||||
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
|
||||
|
||||
MULT2( chainv[6], chainv[7], MASK );
|
||||
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
|
||||
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
|
||||
|
||||
MULT2( chainv[8], chainv[9], MASK );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
||||
|
||||
t0 = chainv[8];
|
||||
t1 = chainv[9];
|
||||
|
||||
MULT2( chainv[8], chainv[9], MASK );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
|
||||
|
||||
MULT2( chainv[6], chainv[7], MASK );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
|
||||
|
||||
MULT2( chainv[4], chainv[5], MASK );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
|
||||
|
||||
MULT2( chainv[2], chainv[3], MASK );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
|
||||
|
||||
MULT2( chainv[0], chainv[1], MASK );
|
||||
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
|
||||
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
|
||||
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
|
||||
_mm256_srli_epi32( chainv[3], 31 ) );
|
||||
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
|
||||
_mm256_srli_epi32( chainv[5], 30 ) );
|
||||
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
|
||||
_mm256_srli_epi32( chainv[7], 29 ) );
|
||||
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
|
||||
_mm256_srli_epi32( chainv[9], 28 ) );
|
||||
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7],
|
||||
x[4], x[5], x[6], x[7] );
|
||||
|
||||
STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
|
||||
STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
|
||||
STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
|
||||
|
||||
MIXTON1024( x[0], x[1], x[2], x[3],
|
||||
chainv[0], chainv[2], chainv[4],chainv[6],
|
||||
x[4], x[5], x[6], x[7],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7]);
|
||||
|
||||
/* Process last 256-bit block */
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
|
||||
tmp[0], tmp[1] );
|
||||
}
|
||||
|
||||
/***************************************************/
|
||||
/* Finalization function */
|
||||
/* state: hash context */
|
||||
/* b[8]: hash values */
|
||||
|
||||
void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
||||
{
|
||||
uint32 hash[8] __attribute((aligned(64)));
|
||||
__m256i* chainv = state->chainv;
|
||||
__m256i t[2];
|
||||
__m256i zero[2];
|
||||
zero[0] = zero[1] = m256_zero;
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
/*---- blank round with m=0 ----*/
|
||||
rnd512_2way( state, zero );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[2] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[3] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[4] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[5] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[6] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[7] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[8] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[9] );
|
||||
|
||||
t[0] = _mm256_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm256_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
|
||||
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
|
||||
|
||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
||||
|
||||
rnd512_2way( state, zero );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[2] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[3] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[4] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[5] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[6] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[7] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[8] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[9] );
|
||||
|
||||
t[0] = _mm256_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm256_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
|
||||
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
|
||||
|
||||
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 1 ), shuff_bswap32 );
|
||||
}
|
||||
|
||||
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
|
||||
{
|
||||
state->hashbitlen = hashbitlen;
|
||||
__m128i *iv = (__m128i*)IV;
|
||||
|
||||
state->chainv[0] = m256_const1_128( iv[0] );
|
||||
state->chainv[1] = m256_const1_128( iv[1] );
|
||||
state->chainv[2] = m256_const1_128( iv[2] );
|
||||
state->chainv[3] = m256_const1_128( iv[3] );
|
||||
state->chainv[4] = m256_const1_128( iv[4] );
|
||||
state->chainv[5] = m256_const1_128( iv[5] );
|
||||
state->chainv[6] = m256_const1_128( iv[6] );
|
||||
state->chainv[7] = m256_const1_128( iv[7] );
|
||||
state->chainv[8] = m256_const1_128( iv[8] );
|
||||
state->chainv[9] = m256_const1_128( iv[9] );
|
||||
|
||||
((__m256i*)state->buffer)[0] = m256_zero;
|
||||
((__m256i*)state->buffer)[1] = m256_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Do not call luffa_update_close after having called luffa_update.
|
||||
// Once luffa_update has been called only call luffa_update or luffa_close.
|
||||
int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
__m256i *buffer = (__m256i*)state->buffer;
|
||||
__m256i msg[2];
|
||||
int i;
|
||||
int blocks = (int)len >> 5;
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
state-> rembytes = (int)len & 0x1F;
|
||||
|
||||
// full blocks
|
||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||
{
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
// store in buffer for transform in final for midstate to work
|
||||
if ( state->rembytes )
|
||||
{
|
||||
// remaining data bytes
|
||||
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
|
||||
buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int luffa_2way_close( luffa_2way_context *state, void *hashval )
|
||||
{
|
||||
__m256i *buffer = (__m256i*)state->buffer;
|
||||
__m256i msg[2];
|
||||
|
||||
// transform pad block
|
||||
if ( state->rembytes )
|
||||
// not empty, data is in buffer
|
||||
rnd512_2way( state, buffer );
|
||||
else
|
||||
{ // empty pad block, constant data
|
||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
finalization512_2way( state, (uint32*)hashval );
|
||||
|
||||
if ( state->hashbitlen > 512 )
|
||||
finalization512_2way( state, (uint32*)( hashval+32 ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
int luffa_2way_update_close( luffa_2way_context *state,
|
||||
void *output, const void *data, size_t inlen )
|
||||
{
|
||||
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
|
||||
const __m256i *vdata = (__m256i*)data;
|
||||
__m256i msg[2];
|
||||
int i;
|
||||
const int blocks = (int)( inlen >> 5 );
|
||||
const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
|
||||
state->rembytes = inlen & 0x1F;
|
||||
|
||||
// full blocks
|
||||
for ( i = 0; i < blocks; i++, vdata+=2 )
|
||||
{
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
|
||||
finalization512_2way( state, (uint32*)output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
finalization512_2way( state, (uint32*)( output+32 ) );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,69 +0,0 @@
|
||||
#if !defined(LUFFA_HASH_2WAY_H__)
|
||||
#define LUFFA_HASH_2WAY_H__ 1
|
||||
/*
|
||||
* luffa_for_sse2.h
|
||||
* Version 2.0 (Sep 15th 2009)
|
||||
*
|
||||
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
|
||||
*
|
||||
* Hitachi, Ltd. is the owner of this software and hereby grant
|
||||
* the U.S. Government and any interested party the right to use
|
||||
* this software for the purposes of the SHA-3 evaluation process,
|
||||
* notwithstanding that this software is copyrighted.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/* The length of digests*/
|
||||
#define DIGEST_BIT_LEN_224 224
|
||||
#define DIGEST_BIT_LEN_256 256
|
||||
#define DIGEST_BIT_LEN_384 384
|
||||
#define DIGEST_BIT_LEN_512 512
|
||||
|
||||
/*********************************/
|
||||
/* The parameters of Luffa */
|
||||
#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/
|
||||
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
|
||||
* of a message block*/
|
||||
|
||||
/* The number of blocks in Luffa */
|
||||
#define WIDTH_224 3
|
||||
#define WIDTH_256 3
|
||||
#define WIDTH_384 4
|
||||
#define WIDTH_512 5
|
||||
|
||||
/* The limit of the length of message */
|
||||
#define LIMIT_224 64
|
||||
#define LIMIT_256 64
|
||||
#define LIMIT_384 128
|
||||
#define LIMIT_512 128
|
||||
/*********************************/
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*2] __attribute((aligned(64)));
|
||||
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
} luffa_2way_context;
|
||||
|
||||
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
|
||||
int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
size_t len );
|
||||
int luffa_2way_close( luffa_2way_context *state, void *hashval );
|
||||
int luffa_2way_update_close( luffa_2way_context *state, void *output,
|
||||
const void *data, size_t inlen );
|
||||
|
||||
#endif
|
||||
#endif
|
715
algo/lyra2/lyra2-hash-2way.c
Normal file
715
algo/lyra2/lyra2-hash-2way.c
Normal file
@@ -0,0 +1,715 @@
|
||||
/**
|
||||
* Implementation of the Lyra2 Password Hashing Scheme (PHS).
|
||||
*
|
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
*
|
||||
* This software is hereby placed in the public domain.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "compat.h"
|
||||
#include "lyra2.h"
|
||||
#include "sponge.h"
|
||||
|
||||
/**
|
||||
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
|
||||
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
|
||||
* where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
|
||||
* integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
|
||||
* of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
|
||||
*
|
||||
* @param K The derived key to be output by the algorithm
|
||||
* @param kLen Desired key length
|
||||
* @param pwd User password
|
||||
* @param pwdlen Password length
|
||||
* @param salt Salt
|
||||
* @param saltlen Salt length
|
||||
* @param timeCost Parameter to determine the processing time (T)
|
||||
* @param nRows Number or rows of the memory matrix (R)
|
||||
* @param nCols Number of columns of the memory matrix (C)
|
||||
*
|
||||
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
|
||||
*/
|
||||
|
||||
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
|
||||
const uint64_t timeCost, const uint64_t nRows,
|
||||
const uint64_t nCols )
|
||||
{
|
||||
//====================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
// int64_t i; //auxiliary iteration counter
|
||||
int64_t v64; // 64bit var for memcpy
|
||||
//====================================================================/
|
||||
|
||||
//=== Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
|
||||
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||
|
||||
//=== Getting the password + salt + basil padded with 10*1 ==========//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen);
|
||||
ptrByte += pwdlen;
|
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen);
|
||||
ptrByte += saltlen;
|
||||
|
||||
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||
- (saltlen + pwdlen) );
|
||||
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = pwdlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = saltlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = timeCost;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nRows;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nCols;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
|
||||
// from here on it's all simd acces to state and matrix
|
||||
// define vector pointers and adjust sizes and pointer offsets
|
||||
|
||||
//================= Initializing the Sponge State ====================//
|
||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||
|
||||
// initState( state );
|
||||
|
||||
//========================= Setup Phase =============================//
|
||||
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||
|
||||
ptrWord = wholeMatrix;
|
||||
|
||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||
/*
|
||||
for (i = 0; i < nBlocksInput; i++)
|
||||
{
|
||||
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
|
||||
}
|
||||
*/
|
||||
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
|
||||
|
||||
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||
nCols);
|
||||
|
||||
do
|
||||
{
|
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
|
||||
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa = (rowa + step) & (window - 1);
|
||||
//update prev: it now points to the last row ever computed
|
||||
|
||||
prev = row;
|
||||
//updates row: goes to the next row to be computed
|
||||
row++;
|
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa == 0)
|
||||
{
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
//===================== Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for (tau = 1; tau <= timeCost; tau++)
|
||||
{
|
||||
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||
do
|
||||
{
|
||||
//Selects a pseudorandom index row*
|
||||
//-----------------------------------------------
|
||||
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
|
||||
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//-------------------------------------------
|
||||
|
||||
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//----------------------------------------------------
|
||||
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//----------------------------------------------------
|
||||
|
||||
} while (row != 0);
|
||||
}
|
||||
|
||||
//===================== Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||
//Squeezes the key
|
||||
squeeze(state, K, (unsigned int) kLen);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////
|
||||
|
||||
// 2 way 256
|
||||
// drop salt, salt len arguments, hard code some others.
|
||||
// Data is interleaved 2x256.
|
||||
|
||||
int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
|
||||
const void *pwd, const uint64_t pwdlen, const void *salt,
|
||||
const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
|
||||
const uint64_t nCols )
|
||||
{
|
||||
//====================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
// int64_t i; //auxiliary iteration counter
|
||||
int64_t v64; // 64bit var for memcpy
|
||||
uint64_t instance0 = 0; // Seperate instance for each lane
|
||||
uint64_t instance1 = 0;
|
||||
//====================================================================/
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
|
||||
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
// 2 way 256 rewrite. Salt always == password, and data is interleaved,
|
||||
// need to build in parallel:
|
||||
// { password, (64 or 80 bytes)
|
||||
// salt, (64 or 80 bytes) = same as password
|
||||
// Klen, (u64) = 32 bytes
|
||||
// pwdlen, (u64)
|
||||
// saltlen, (u64)
|
||||
// timecost, (u64)
|
||||
// nrows, (u64)
|
||||
// ncols, (u64)
|
||||
// 0x80, (byte)
|
||||
// { 0 .. 0 },
|
||||
// 1 (byte)
|
||||
// }
|
||||
|
||||
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||
|
||||
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen);
|
||||
ptrByte += pwdlen;
|
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen);
|
||||
ptrByte += saltlen;
|
||||
|
||||
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||
- (saltlen + pwdlen) );
|
||||
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = pwdlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = saltlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = timeCost;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nRows;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nCols;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
|
||||
// from here on it's all simd acces to state and matrix
|
||||
// define vector pointers and adjust sizes and pointer offsets
|
||||
|
||||
ptrWord = wholeMatrix;
|
||||
|
||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||
reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
|
||||
|
||||
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||
nCols);
|
||||
|
||||
do
|
||||
{
|
||||
|
||||
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
|
||||
rowa = (rowa + step) & (window - 1);
|
||||
|
||||
prev = row;
|
||||
row++;
|
||||
|
||||
if (rowa == 0)
|
||||
{
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
row = 0;
|
||||
for (tau = 1; tau <= timeCost; tau++)
|
||||
{
|
||||
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
|
||||
do
|
||||
{
|
||||
// This part is not parallel, rowa will be different for each lane.
|
||||
// state (u64[16]) is interleaved 2x256, need to extract seperately.
|
||||
|
||||
// index = 2 * instance / 4 * 4 + instance % 4
|
||||
uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
|
||||
+ ( instance0 & 0x3 )
|
||||
uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
|
||||
+ ( instance1 & 0x3 )
|
||||
|
||||
instance0 = state[ index0 ] & 0xf;
|
||||
instance1 = (state+4)[ index1 ] & 0xf;
|
||||
|
||||
rowa0 = state[ instance0 ];
|
||||
rowa1 = (state+4)[ instance1 ];
|
||||
|
||||
reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa0*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa1*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
/*
|
||||
instance = state[instance & 0xF];
|
||||
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
|
||||
|
||||
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
*/
|
||||
// End of divergence.
|
||||
|
||||
prev = row;
|
||||
row = (row + step) & (unsigned int)(nRows-1);
|
||||
|
||||
} while ( row != 0 );
|
||||
}
|
||||
|
||||
absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
|
||||
squeeze( state, K, (unsigned int) kLen );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
|
||||
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
|
||||
const uint64_t timeCost, const uint64_t nRows,
|
||||
const uint64_t nCols )
|
||||
{
|
||||
//========================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
// int64_t i; //auxiliary iteration counter
|
||||
//=======================================================================/
|
||||
|
||||
//======= Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
|
||||
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
|
||||
|
||||
//==== Getting the password + salt + basil padded with 10*1 ============//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
|
||||
sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
|
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen);
|
||||
ptrByte += pwdlen;
|
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen);
|
||||
ptrByte += saltlen;
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &saltlen, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &timeCost, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &nRows, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
memcpy(ptrByte, &nCols, sizeof (uint64_t));
|
||||
ptrByte += sizeof (uint64_t);
|
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
|
||||
//=================== Initializing the Sponge State ====================//
|
||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
|
||||
// if (state == NULL) {
|
||||
// return -1;
|
||||
// }
|
||||
// initState( state );
|
||||
|
||||
//============================== Setup Phase =============================//
|
||||
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
|
||||
BLOCK_LEN_BLAKE2_SAFE_INT64 );
|
||||
/*
|
||||
for ( i = 0; i < nBlocksInput; i++ )
|
||||
{
|
||||
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
|
||||
}
|
||||
*/
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
|
||||
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
|
||||
|
||||
do {
|
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
|
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa = (rowa + step) & (window - 1);
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
//updates row: goes to the next row to be computed
|
||||
row++;
|
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa == 0) {
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
//======================== Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for ( tau = 1; tau <= timeCost; tau++ )
|
||||
{
|
||||
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||
do {
|
||||
//Selects a pseudorandom index row*
|
||||
//----------------------------------------------------------------------
|
||||
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//-----------------------------------------------------------------
|
||||
|
||||
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||
reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
|
||||
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//---------------------------------------------------------------
|
||||
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
} while (row != 0);
|
||||
}
|
||||
|
||||
//========================= Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||
|
||||
//Squeezes the key
|
||||
squeeze( state, K, kLen );
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Lyra2RE doesn't like the new wholeMatrix implementation
|
||||
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
|
||||
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
|
||||
const uint64_t nRows, const uint64_t nCols )
|
||||
{
|
||||
//====================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[16];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
int64_t i; //auxiliary iteration counter
|
||||
int64_t v64; // 64bit var for memcpy
|
||||
//====================================================================/
|
||||
|
||||
//=== Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
|
||||
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
|
||||
|
||||
i = (int64_t)ROW_LEN_BYTES * nRows;
|
||||
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
|
||||
if (wholeMatrix == NULL)
|
||||
return -1;
|
||||
|
||||
#if defined(__AVX2__)
|
||||
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
|
||||
#elif defined(__SSE2__)
|
||||
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
|
||||
#else
|
||||
memset( wholeMatrix, 0, i );
|
||||
#endif
|
||||
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
|
||||
//=== Getting the password + salt + basil padded with 10*1 ==========//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
|
||||
byte *ptrByte = (byte*) wholeMatrix;
|
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen);
|
||||
ptrByte += pwdlen;
|
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen);
|
||||
ptrByte += saltlen;
|
||||
|
||||
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
|
||||
// - (saltlen + pwdlen) );
|
||||
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = pwdlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = saltlen;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = timeCost;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nRows;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
v64 = nCols;
|
||||
memcpy(ptrByte, &v64, sizeof(int64_t));
|
||||
ptrByte += sizeof(uint64_t);
|
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
|
||||
//================= Initializing the Sponge State ====================//
|
||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||
|
||||
// initState( state );
|
||||
|
||||
//========================= Setup Phase =============================//
|
||||
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||
|
||||
ptrWord = wholeMatrix;
|
||||
|
||||
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||
/*
|
||||
for (i = 0; i < nBlocksInput; i++)
|
||||
{
|
||||
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
|
||||
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
|
||||
}
|
||||
*/
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
|
||||
|
||||
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
|
||||
nCols);
|
||||
|
||||
do
|
||||
{
|
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
|
||||
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa = (rowa + step) & (window - 1);
|
||||
//update prev: it now points to the last row ever computed
|
||||
|
||||
prev = row;
|
||||
//updates row: goes to the next row to be computed
|
||||
row++;
|
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa == 0)
|
||||
{
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
//===================== Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for (tau = 1; tau <= timeCost; tau++)
|
||||
{
|
||||
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
|
||||
do
|
||||
{
|
||||
//Selects a pseudorandom index row*
|
||||
//-----------------------------------------------
|
||||
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
|
||||
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//-------------------------------------------
|
||||
|
||||
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
|
||||
&wholeMatrix[rowa*ROW_LEN_INT64],
|
||||
&wholeMatrix[row*ROW_LEN_INT64], nCols );
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//----------------------------------------------------
|
||||
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//----------------------------------------------------
|
||||
|
||||
} while (row != 0);
|
||||
}
|
||||
|
||||
//===================== Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
|
||||
//Squeezes the key
|
||||
squeeze(state, K, (unsigned int) kLen);
|
||||
|
||||
//================== Freeing the memory =============================//
|
||||
_mm_free(wholeMatrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
319
algo/lyra2/sponge-2way.c
Normal file
319
algo/lyra2/sponge-2way.c
Normal file
@@ -0,0 +1,319 @@
|
||||
/**
|
||||
* A simple implementation of Blake2b's internal permutation
|
||||
* in the form of a sponge.
|
||||
*
|
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
*
|
||||
* This software is hereby placed in the public domain.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
|
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "algo-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <immintrin.h>
|
||||
#include "sponge.h"
|
||||
#include "lyra2.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
|
||||
{
|
||||
const int len_m256i = len / 32;
|
||||
const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
|
||||
__m512i* state = (__m512i*)State;
|
||||
__m512i* out = (__m512i*)Out;
|
||||
int i;
|
||||
|
||||
//Squeezes full blocks
|
||||
for ( i = 0; i < fullBlocks; i++ )
|
||||
{
|
||||
memcpy_512( out, state, BLOCK_LEN_M256I*2 );
|
||||
LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
|
||||
out += BLOCK_LEN_M256I*2;
|
||||
}
|
||||
//Squeezes remaining bytes
|
||||
memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
|
||||
}
|
||||
|
||||
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
|
||||
{
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i *in = (__m512i*)In;
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
state0 = _mm512_xor_si512( state0, in[0] );
|
||||
state1 = _mm512_xor_si512( state1, in[1] );
|
||||
state2 = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
|
||||
}
|
||||
|
||||
inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
|
||||
const uint64_t nBlocks, const uint64_t block_len )
|
||||
{
|
||||
register __m512i state0, state1, state2, state3;
|
||||
|
||||
state0 =
|
||||
state1 = m512_zero;
|
||||
state2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
|
||||
0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
|
||||
0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
|
||||
for ( int i = 0; i < nBlocks; i++ )
|
||||
{
|
||||
__m512i *in = (__m512i*)In;
|
||||
state0 = _mm512_xor_si512( state0, in[0] );
|
||||
state1 = _mm512_xor_si512( state1, in[1] );
|
||||
|
||||
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
In += block_len * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
|
||||
}
|
||||
|
||||
inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
|
||||
uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
|
||||
//M[row][C-1-col] = H.reduced_squeeze()
|
||||
|
||||
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
for ( i = 0; i < 9; i += 3)
|
||||
{
|
||||
_mm_prefetch( out - i, _MM_HINT_T0 );
|
||||
_mm_prefetch( out - i - 2, _MM_HINT_T0 );
|
||||
}
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
_mm_prefetch( out - 9, _MM_HINT_T0 );
|
||||
_mm_prefetch( out - 11, _MM_HINT_T0 );
|
||||
|
||||
out[0] = state0;
|
||||
out[1] = state1;
|
||||
out[2] = state2;
|
||||
|
||||
//Goes to next block (column) that will receive the squeezed data
|
||||
out -= BLOCK_LEN_M256I * 2;
|
||||
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
|
||||
// This function has to deal with gathering 2 256 bit rowin vectors from
|
||||
// non-contiguous memory. Extra work and performance penalty.
|
||||
|
||||
inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
|
||||
uint64_t *rowOut, uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i *in = (__m256i*)rowIn;
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
state0 = _mm512_xor_si512( state0, in[0] );
|
||||
state1 = _mm512_xor_si512( state1, in[1] );
|
||||
state2 = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
out[0] = _mm512_xor_si512( state0, in[0] );
|
||||
out[1] = _mm512_xor_si512( state1, in[1] );
|
||||
out[2] = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
//Input: next column (i.e., next block in sequence)
|
||||
in0 += BLOCK_LEN_M256I;
|
||||
in1 += BLOCK_LEN_M256I;
|
||||
//Output: goes to previous column
|
||||
out -= BLOCK_LEN_M256I * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si256( (__m512i*)State, state0 );
|
||||
_mm512_store_si256( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si256( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si256( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
}
|
||||
|
||||
inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m512i* in = (__m512i*)rowIn;
|
||||
__m512i* inout = (__m512i*)rowInOut;
|
||||
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
|
||||
__m512i t0, t1, t2;
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
state0 = _mm512_xor_si512( state0,
|
||||
_mm512_add_epi64( in[0], inout[0] ) );
|
||||
state1 = _mm512_xor_si512( state1,
|
||||
_mm512_add_epi64( in[1], inout[1] ) );
|
||||
state2 = _mm512_xor_si512( state2,
|
||||
_mm512_add_epi64( in[2], inout[2] ) );
|
||||
|
||||
LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
|
||||
|
||||
out[0] = _mm512_xor_si512( state0, in[0] );
|
||||
out[1] = _mm512_xor_si512( state1, in[1] );
|
||||
out[2] = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
|
||||
inout[0] = _mm512_xor_si512( inout[0],
|
||||
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
||||
inout[1] = _mm512_xor_si512( inout[1],
|
||||
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
||||
inout[2] = _mm512_xor_si512( inout[2],
|
||||
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
||||
|
||||
//Inputs: next column (i.e., next block in sequence)
|
||||
in += BLOCK_LEN_M256I * 2;
|
||||
inout += BLOCK_LEN_M256I * 2;
|
||||
//Output: goes to previous column
|
||||
out -= BLOCK_LEN_M256I * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
|
||||
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
|
||||
uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
|
||||
uint64_t nCols )
|
||||
{
|
||||
int i;
|
||||
|
||||
register __m512i state0, state1, state2, state3;
|
||||
__m256i *in0 = (__m256i*)rowIn0;
|
||||
__m256i *in0 = (__m256i*)rowIn0;
|
||||
__m2512* in = (__m512i*)rowIn;
|
||||
__m2512* inout = (__m512i*)rowInOut;
|
||||
__m512i* out = (__m512i*)rowOut;
|
||||
__m512i t0, t1, t2;
|
||||
|
||||
_mm_prefetch( in0, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( in0 + 6, _MM_HINT_T0 );
|
||||
_mm_prefetch( in1 + 6, _MM_HINT_T0 );
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
|
||||
// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
|
||||
// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
|
||||
// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
|
||||
t0 = mm512_concat_256( in1[0], in0[0] );
|
||||
t1 = mm512_concat_256( in1[1], in0[1] );
|
||||
t2 = mm512_concat_256( in1[2], in0[2] );
|
||||
|
||||
state0 = _mm512_xor_si512( state0,
|
||||
_mm512_add_epi64( t0, inout[0] ) );
|
||||
state1 = _mm512_xor_si512( state1,
|
||||
_mm512_add_epi64( t1, inout[1] ) );
|
||||
state2 = _mm512_xor_si512( state2,
|
||||
_mm512_add_epi64( t2, inout[2] ) );
|
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
//M[rowOut][col] = M[rowOut][col] XOR rand
|
||||
out[0] = _mm512_xor_si512( out[0], state0 );
|
||||
out[1] = _mm512_xor_si512( out[1], state1 );
|
||||
out[2] = _mm512_xor_si512( out[2], state2 );
|
||||
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
|
||||
inout[0] = _mm512_xor_si512( inout[0],
|
||||
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
|
||||
inout[1] = _mm512_xor_si512( inout[1],
|
||||
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
|
||||
inout[2] = _mm512_xor_si512( inout[2],
|
||||
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
|
||||
|
||||
//Goes to next block
|
||||
in += BLOCK_LEN_M256I * 2;
|
||||
out += BLOCK_LEN_M256I * 2;
|
||||
inout += BLOCK_LEN_M256I * 2;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
_mm512_store_si512( (__m512i*)State + 1, state1 );
|
||||
_mm512_store_si512( (__m512i*)State + 2, state2 );
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
|
||||
#endif // AVX512
|
@@ -52,8 +52,46 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
// However, 2 way parallel looks trivial to code for AVX512 except for
|
||||
// a data dependency with rowa.
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define G2W_4X64(a,b,c,d) \
|
||||
a = _mm512_add_epi64( a, b ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
|
||||
a = _mm512_add_epi64( a, b ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
|
||||
|
||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_ror_1x64( s1); \
|
||||
s2 = mm512_swap128_256( s2 ); \
|
||||
s3 = mm512_rol1x64_256( s3 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s1 = mm512_rol1x64_256( s1 ); \
|
||||
s2 = mm512_swap128_256( s2 ); \
|
||||
s3 = mm512_ror1x64_256( s3 );
|
||||
|
||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined __AVX2__
|
||||
// only available with avx2
|
||||
|
||||
// process 4 columns in parallel
|
||||
// returns void, updates all args
|
||||
@@ -89,9 +127,11 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 )
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// process 2 columns in parallel
|
||||
// returns void, all args updated
|
||||
@@ -129,7 +169,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
|
||||
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)
|
||||
|
||||
|
||||
#endif // AVX2 else SSE2
|
||||
@@ -161,6 +201,30 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
//---- Housekeeping
|
||||
void initState_2way( uint64_t state[/*16*/] );
|
||||
|
||||
//---- Squeezes
|
||||
void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
|
||||
void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
|
||||
|
||||
//---- Absorbs
|
||||
void absorbBlock_2way( uint64_t *state, const uint64_t *in );
|
||||
void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
|
||||
const uint64_t nBlocks, const uint64_t block_len );
|
||||
|
||||
//---- Duplexes
|
||||
void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
|
||||
uint64_t *rowOut, uint64_t nCols);
|
||||
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
|
||||
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
|
||||
void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
//---- Housekeeping
|
||||
void initState(uint64_t state[/*16*/]);
|
||||
|
||||
@@ -178,20 +242,4 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint6
|
||||
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
||||
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
|
||||
|
||||
//---- Misc
|
||||
void printArray(unsigned char *array, unsigned int size, char *name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
////TESTS////
|
||||
//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
|
||||
//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
/////////////
|
||||
|
||||
|
||||
#endif /* SPONGE_H_ */
|
||||
|
@@ -5,7 +5,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
// #define HMQ1725_4WAY
|
||||
// #define HMQ1725_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_hmq1725_algo( algo_gate_t* gate );
|
||||
|
@@ -4,7 +4,8 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/luffa/luffa-hash-2way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
@@ -13,73 +14,70 @@
|
||||
|
||||
typedef struct
|
||||
{
|
||||
luffa_4way_context luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_4way_context simd;
|
||||
hashState_echo echo;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_4way_context simd;
|
||||
simd_2way_context simd2;
|
||||
hashState_echo echo;
|
||||
} qubit_4way_ctx_holder;
|
||||
|
||||
qubit_4way_ctx_holder qubit_4way_ctx;
|
||||
|
||||
void init_qubit_4way_ctx()
|
||||
{
|
||||
cubehashInit(&qubit_4way_ctx.cube,512,16,32);
|
||||
sph_shavite512_init(&qubit_4way_ctx.shavite);
|
||||
simd_4way_init( &qubit_4way_ctx.simd, 512 );
|
||||
init_echo(&qubit_4way_ctx.echo, 512);
|
||||
cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init(&qubit_4way_ctx.shavite);
|
||||
simd_4way_init( &qubit_4way_ctx.simd, 512 );
|
||||
simd_2way_init( &qubit_4way_ctx.simd2, 512 );
|
||||
init_echo(&qubit_4way_ctx.echo, 512);
|
||||
};
|
||||
|
||||
void qubit_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint32_t vhash[16*4] __attribute__ ((aligned (128)));
|
||||
uint32_t hash0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[16] __attribute__ ((aligned (64)));
|
||||
qubit_4way_ctx_holder ctx;
|
||||
|
||||
memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
|
||||
|
||||
luffa_4way_update( &ctx.luffa, input + (64<<2), 16 );
|
||||
luffa_4way_close( &ctx.luffa, vhash );
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
|
||||
memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
|
||||
|
||||
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
|
||||
memcpy( &ctx.shavite, &qubit_4way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
|
||||
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
|
||||
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
|
||||
@@ -92,71 +90,40 @@ void qubit_4way_hash( void *output, const void *input )
|
||||
int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[4*16] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[4*24] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[4*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
|
||||
int thr_id = mythr->id;
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
0xFFF, 0xFFFF, 0x10000000 };
|
||||
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
|
||||
0xFFFFF000, 0xFFFF0000, 0 };
|
||||
|
||||
casti_m512i( endiandata, 0 ) = mm512_bswap_32( casti_m512i( pdata, 0 ) );
|
||||
casti_m512i( endiandata, 1 ) = mm512_bswap_32( casti_m512i( pdata, 1 ) );
|
||||
casti_m512i( endiandata, 4 ) = mm512_bswap_32( casti_m512i( pdata, 4 ) );
|
||||
|
||||
uint64_t *edata = (uint64_t*)endiandata;
|
||||
intrlv_4x128( (uint64_t*)vdata, edata, edata, 640 );
|
||||
|
||||
mm512_bswap32_intrlv80_4x128( vdata, pdata );
|
||||
luffa_4way_init( &qubit_4way_ctx.luffa, 512 );
|
||||
luffa_4way_update( &qubit_4way_ctx.luffa, vdata, 64 );
|
||||
|
||||
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
|
||||
do
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+4, n+1 );
|
||||
be32enc( noncep+8, n+2 );
|
||||
be32enc( noncep+12, n+3 );
|
||||
qubit_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
be32enc( noncep, n );
|
||||
be32enc( noncep+ 4, n+1 );
|
||||
be32enc( noncep+ 8, n+2 );
|
||||
be32enc( noncep+12, n+3 );
|
||||
|
||||
if ( !( hash[7] & mask ) )
|
||||
if ( fulltest( hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_lane_solution( work, hash, mythr, 0 );
|
||||
}
|
||||
if ( !( (hash+8)[7] & mask ) )
|
||||
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_lane_solution( work, hash+8, mythr, 1 );
|
||||
}
|
||||
if ( !( hash+16[7] & mask ) )
|
||||
if ( fulltest( hash, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+2;
|
||||
submit_lane_solution( work, hash, mythr, 2 );
|
||||
}
|
||||
if ( !( (hash+24)[7] & mask ) )
|
||||
if ( fulltest( hash+8, ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+3;
|
||||
submit_lane_solution( work, hash+8, mythr, 3 );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
qubit_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash+(lane<<3) )[7] < Htarg )
|
||||
if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
|
||||
}
|
||||
n += 4;
|
||||
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -2,14 +2,12 @@
|
||||
|
||||
bool register_qubit_algo( algo_gate_t* gate )
|
||||
{
|
||||
/*
|
||||
|
||||
#if defined (QUBIT_4WAY)
|
||||
init_qubit_2way_ctx();
|
||||
init_qubit_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_qubit_4way;
|
||||
gate->hash = (void*)&qubit_4way_hash;
|
||||
#elif defined (QUBIT_4WAY)
|
||||
*/
|
||||
#if defined (QUBIT_2WAY)
|
||||
#elif defined (QUBIT_2WAY)
|
||||
init_qubit_2way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_qubit_2way;
|
||||
gate->hash = (void*)&qubit_2way_hash;
|
||||
@@ -18,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_qubit;
|
||||
gate->hash = (void*)&qubit_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,17 +4,15 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
/*
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define QUBIT_2WAY 1
|
||||
#define QUBIT_4WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
*/
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define QUBIT_2WAY 1
|
||||
#endif
|
||||
|
||||
bool register_qubit_algo( algo_gate_t* gate );
|
||||
/*
|
||||
|
||||
#if defined(QUBIT_4WAY)
|
||||
|
||||
void qubit_4way_hash( void *state, const void *input );
|
||||
@@ -23,8 +21,6 @@ int scanhash_qubit_4way( struct work *work, uint32_t max_nonce,
|
||||
void init_qubit_4way_ctx();
|
||||
|
||||
#elif defined(QUBIT_2WAY)
|
||||
*/
|
||||
#if defined(QUBIT_2WAY)
|
||||
|
||||
void qubit_2way_hash( void *state, const void *input );
|
||||
int scanhash_qubit_2way( struct work *work, uint32_t max_nonce,
|
||||
|
@@ -735,7 +735,7 @@ do { \
|
||||
fft128_4way( a+512 );
|
||||
}
|
||||
|
||||
#define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||
#define c1_16_512( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||
|
||||
void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
||||
{
|
||||
@@ -744,8 +744,12 @@ void rounds512_4way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
|
||||
__m512i *S = (__m512i*) state;
|
||||
__m512i *M = (__m512i*) msg;
|
||||
__m512i *W = (__m512i*) fft;
|
||||
static const m512_v16 code[] = { c1_16(185), c1_16(233),
|
||||
c1_16(185), c1_16(233) };
|
||||
|
||||
static const m512_v16 code[] = { c1_16_512(185), c1_16_512(233) };
|
||||
|
||||
|
||||
// static const m512_v16 code[] = { c1_16(185), c1_16(233),
|
||||
// c1_16(185), c1_16(233) };
|
||||
|
||||
|
||||
S0l = _mm512_xor_si512( S[0], M[0] );
|
||||
@@ -999,7 +1003,9 @@ void SIMD_4way_Compress( simd_4way_context *state, const void *m, int final )
|
||||
{
|
||||
m512_v16 Y[32];
|
||||
uint16_t *y = (uint16_t*) Y[0].u16;
|
||||
|
||||
fft256_4way_msg( y, m, final );
|
||||
|
||||
rounds512_4way( state->A, m, y );
|
||||
}
|
||||
|
||||
@@ -1340,7 +1346,8 @@ do { \
|
||||
DO_REDUCE_FULL_S( 6 );
|
||||
DO_REDUCE_FULL_S( 7 );
|
||||
|
||||
#undef BUTTERFLY
|
||||
#undef BUTTERFLY_0
|
||||
#undef BUTTERFLY_N
|
||||
#undef DO_REDUCE
|
||||
|
||||
A[0] = X0;
|
||||
@@ -1491,6 +1498,7 @@ do { \
|
||||
|
||||
fft128_2way( a );
|
||||
fft128_2way( a+256 );
|
||||
|
||||
}
|
||||
|
||||
#define c1_16( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
|
||||
@@ -1751,7 +1759,9 @@ void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
|
||||
{
|
||||
m256_v16 Y[32];
|
||||
uint16_t *y = (uint16_t*) Y[0].u16;
|
||||
|
||||
fft256_2way_msg( y, m, final );
|
||||
|
||||
rounds512_2way( state->A, m, y );
|
||||
}
|
||||
|
||||
@@ -1864,6 +1874,7 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval,
|
||||
{
|
||||
// We can hash the data directly from the input buffer.
|
||||
SIMD_2way_Compress( state, data, 0 );
|
||||
|
||||
databitlen -= bs;
|
||||
data += 2*( bs/8 );
|
||||
state->count += bs;
|
||||
@@ -1874,7 +1885,8 @@ int simd_2way_update_close( simd_2way_context *state, void *hashval,
|
||||
int len = bs - current;
|
||||
if ( databitlen < len )
|
||||
{
|
||||
memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
|
||||
|
||||
memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) );
|
||||
state->count += databitlen;
|
||||
break;
|
||||
}
|
||||
|
@@ -1,11 +1,7 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "c11-gate.h"
|
||||
|
||||
#if defined (C11_4WAY)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
@@ -13,11 +9,237 @@
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/luffa-hash-2way.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
#if defined (C11_8WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_4way_context simd;
|
||||
hashState_echo echo;
|
||||
} c11_8way_ctx_holder;
|
||||
|
||||
c11_8way_ctx_holder c11_8way_ctx;
|
||||
|
||||
void init_c11_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &c11_8way_ctx.blake );
|
||||
bmw512_8way_init( &c11_8way_ctx.bmw );
|
||||
init_groestl( &c11_8way_ctx.groestl, 64 );
|
||||
skein512_8way_init( &c11_8way_ctx.skein );
|
||||
jh512_8way_init( &c11_8way_ctx.jh );
|
||||
keccak512_8way_init( &c11_8way_ctx.keccak );
|
||||
luffa_4way_init( &c11_8way_ctx.luffa, 512 );
|
||||
cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &c11_8way_ctx.shavite );
|
||||
simd_4way_init( &c11_8way_ctx.simd, 512 );
|
||||
init_echo( &c11_8way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void c11_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
c11_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &c11_8way_ctx, sizeof(c11_8way_ctx) );
|
||||
|
||||
// 1 Blake 4way
|
||||
blake512_8way_update( &ctx.blake, input, 80 );
|
||||
blake512_8way_close( &ctx.blake, vhash );
|
||||
|
||||
// 2 Bmw
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
// 3 Groestl
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
// 4way
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7 );
|
||||
|
||||
// 4 JH
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
// 5 Keccak
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
// 6 Skein
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
|
||||
// Serial
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
// 7 Luffa + 8 cube
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||
luffa_4way_init( &ctx.luffa, 512 );
|
||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||
|
||||
// 10 Simd
|
||||
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
|
||||
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
|
||||
simd_4way_init( &ctx.simd, 512 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );
|
||||
|
||||
// 11 Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||
(const BitSequence *) hash4, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||
(const BitSequence *) hash5, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||
(const BitSequence *) hash6, 512 );
|
||||
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||
(const BitSequence *) hash7, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+ 32, hash1, 32 );
|
||||
memcpy( state+ 64, hash2, 32 );
|
||||
memcpy( state+ 96, hash3, 32 );
|
||||
memcpy( state+128, hash4, 32 );
|
||||
memcpy( state+160, hash5, 32 );
|
||||
memcpy( state+192, hash6, 32 );
|
||||
memcpy( state+224, hash7, 32 );
|
||||
}
|
||||
|
||||
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
max_nonce -= 8;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
c11_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( ( ( hash+(i<<3) )[7] < Htarg )
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (C11_4WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
|
@@ -2,7 +2,11 @@
|
||||
|
||||
bool register_c11_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (C11_4WAY)
|
||||
#if defined (C11_8WAY)
|
||||
init_c11_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11_8way;
|
||||
gate->hash = (void*)&c11_8way_hash;
|
||||
#elif defined (C11_4WAY)
|
||||
init_c11_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_c11_4way;
|
||||
gate->hash = (void*)&c11_4way_hash;
|
||||
@@ -11,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_c11;
|
||||
gate->hash = (void*)&c11_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,29 +4,36 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define C11_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define C11_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define C11_4WAY 1
|
||||
#endif
|
||||
|
||||
|
||||
bool register_c11_algo( algo_gate_t* gate );
|
||||
#if defined(C11_8WAY)
|
||||
|
||||
#if defined(C11_4WAY)
|
||||
void c11_8way_hash( void *state, const void *input );
|
||||
int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_c11_8way_ctx();
|
||||
|
||||
#elif defined(C11_4WAY)
|
||||
|
||||
void c11_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_c11_4way_ctx();
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void c11_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_c11( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_c11_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -1,8 +1,5 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "x11-gate.h"
|
||||
|
||||
#if defined (X11_4WAY)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
@@ -12,11 +9,235 @@
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/luffa/luffa-hash-2way.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
#if defined (X11_8WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_4way_context simd;
|
||||
hashState_echo echo;
|
||||
} x11_8way_ctx_holder;
|
||||
|
||||
x11_8way_ctx_holder x11_8way_ctx;
|
||||
|
||||
void init_x11_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &x11_8way_ctx.blake );
|
||||
bmw512_8way_init( &x11_8way_ctx.bmw );
|
||||
init_groestl( &x11_8way_ctx.groestl, 64 );
|
||||
skein512_8way_init( &x11_8way_ctx.skein );
|
||||
jh512_8way_init( &x11_8way_ctx.jh );
|
||||
keccak512_8way_init( &x11_8way_ctx.keccak );
|
||||
luffa_4way_init( &x11_8way_ctx.luffa, 512 );
|
||||
cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11_8way_ctx.shavite );
|
||||
simd_4way_init( &x11_8way_ctx.simd, 512 );
|
||||
init_echo( &x11_8way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void x11_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
|
||||
x11_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );
|
||||
blake512_8way_update( &ctx.blake, input, 80 );
|
||||
blake512_8way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
memcpy( &ctx.groestl, &x11_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
// 4way
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7 );
|
||||
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
// Luffa + Cube
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||
luffa_4way_init( &ctx.luffa, 512 );
|
||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||
memcpy( &ctx.shavite, &x11_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||
simd_4way_init( &ctx.simd, 512 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||
(const BitSequence *) hash4, 512 );
|
||||
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||
(const BitSequence *) hash5, 512 );
|
||||
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||
(const BitSequence *) hash6, 512 );
|
||||
memcpy( &ctx.echo, &x11_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||
(const BitSequence *) hash7, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+ 32, hash1, 32 );
|
||||
memcpy( state+ 64, hash2, 32 );
|
||||
memcpy( state+ 96, hash3, 32 );
|
||||
memcpy( state+128, hash4, 32 );
|
||||
memcpy( state+160, hash5, 32 );
|
||||
memcpy( state+192, hash6, 32 );
|
||||
memcpy( state+224, hash7, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
const uint32_t last_nonce = max_nonce -8;
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
x11_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( ( hash+(i<<3) )[7] < Htarg
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#elif defined (X11_4WAY)
|
||||
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
|
@@ -1,8 +1,12 @@
|
||||
#include "x11-gate.h"
|
||||
|
||||
bool register_x11_algo( algo_gate_t* gate )
|
||||
bool register_x11_algo( algo_gate_t *gate )
|
||||
{
|
||||
#if defined (X11_4WAY)
|
||||
#if defined (X11_8WAY)
|
||||
init_x11_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x11_8way;
|
||||
gate->hash = (void*)&x11_8way_hash;
|
||||
#elif defined (X11_4WAY)
|
||||
init_x11_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x11_4way;
|
||||
gate->hash = (void*)&x11_4way_hash;
|
||||
@@ -11,7 +15,7 @@ bool register_x11_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x11;
|
||||
gate->hash = (void*)&x11_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,29 +4,35 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define X11_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define X11_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define X11_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_x11_algo( algo_gate_t* gate );
|
||||
#if defined(X11_8WAY)
|
||||
|
||||
#if defined(X11_4WAY)
|
||||
void x11_8way_hash( void *state, const void *input );
|
||||
int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_x11_8way_ctx();
|
||||
|
||||
#elif defined(X11_4WAY)
|
||||
|
||||
void x11_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_x11_4way_ctx();
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void x11_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x11( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_x11_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -1,11 +1,7 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "x11gost-gate.h"
|
||||
|
||||
#if defined (X11GOST_4WAY)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
@@ -14,18 +10,269 @@
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/luffa/luffa-hash-2way.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
#if defined (X11GOST_8WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_8way_context blake;
|
||||
bmw512_8way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
keccak512_8way_context keccak;
|
||||
sph_gost512_context gost;
|
||||
luffa_4way_context luffa;
|
||||
cube_4way_context cube;
|
||||
sph_shavite512_context shavite;
|
||||
simd_4way_context simd;
|
||||
hashState_echo echo;
|
||||
} x11gost_8way_ctx_holder;
|
||||
|
||||
x11gost_8way_ctx_holder x11gost_8way_ctx;
|
||||
|
||||
void init_x11gost_8way_ctx()
|
||||
{
|
||||
blake512_8way_init( &x11gost_8way_ctx.blake );
|
||||
bmw512_8way_init( &x11gost_8way_ctx.bmw );
|
||||
init_groestl( &x11gost_8way_ctx.groestl, 64 );
|
||||
skein512_8way_init( &x11gost_8way_ctx.skein );
|
||||
jh512_8way_init( &x11gost_8way_ctx.jh );
|
||||
keccak512_8way_init( &x11gost_8way_ctx.keccak );
|
||||
sph_gost512_init( &x11gost_8way_ctx.gost );
|
||||
luffa_4way_init( &x11gost_8way_ctx.luffa, 512 );
|
||||
cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11gost_8way_ctx.shavite );
|
||||
simd_4way_init( &x11gost_8way_ctx.simd, 512 );
|
||||
init_echo( &x11gost_8way_ctx.echo, 512 );
|
||||
}
|
||||
|
||||
void x11gost_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
|
||||
x11gost_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11gost_8way_ctx, sizeof(x11gost_8way_ctx) );
|
||||
|
||||
blake512_8way_update( &ctx.blake, input, 80 );
|
||||
blake512_8way_close( &ctx.blake, vhash );
|
||||
|
||||
bmw512_8way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_8way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
|
||||
|
||||
// 4way
|
||||
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7 );
|
||||
|
||||
skein512_8way_update( &ctx.skein, vhash, 64 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
|
||||
keccak512_8way_update( &ctx.keccak, vhash, 64 );
|
||||
keccak512_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
// Serial
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash4, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash4 );
|
||||
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash5, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash5 );
|
||||
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash6, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash6 );
|
||||
memcpy( &ctx.gost, &x11gost_8way_ctx.gost, sizeof(sph_gost512_context) );
|
||||
sph_gost512( &ctx.gost, hash7, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash7 );
|
||||
|
||||
|
||||
// Luffa + Cube
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||
luffa_4way_init( &ctx.luffa, 512 );
|
||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash0, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash0 );
|
||||
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash2, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash2 );
|
||||
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash3, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash3 );
|
||||
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash4, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash4 );
|
||||
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash5, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash5 );
|
||||
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash6, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash6 );
|
||||
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
|
||||
sizeof(sph_shavite512_context) );
|
||||
sph_shavite512( &ctx.shavite, hash7, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash7 );
|
||||
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||
simd_4way_init( &ctx.simd, 512 );
|
||||
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||
(const BitSequence *) hash4, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||
(const BitSequence *) hash5, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||
(const BitSequence *) hash6, 512 );
|
||||
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||
(const BitSequence *) hash7, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+ 32, hash1, 32 );
|
||||
memcpy( state+ 64, hash2, 32 );
|
||||
memcpy( state+ 96, hash3, 32 );
|
||||
memcpy( state+128, hash4, 32 );
|
||||
memcpy( state+160, hash5, 32 );
|
||||
memcpy( state+192, hash6, 32 );
|
||||
memcpy( state+224, hash7, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
max_nonce -= 8;
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
x11gost_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( ( hash+(i<<3) )[7] < Htarg
|
||||
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (X11GOST_4WAY)
|
||||
|
||||
typedef struct {
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
sph_gost512_context gost;
|
||||
luffa_2way_context luffa;
|
||||
cubehashParam cube;
|
||||
@@ -76,10 +323,10 @@ void x11gost_4way_hash( void *state, const void *input )
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl) );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
@@ -175,7 +422,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint64_t htmax[] = { 0, 0xF, 0xFF,
|
||||
@@ -185,7 +432,7 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
for (int m=0; m < 6; m++)
|
||||
for (int m=0; m < 6; m++)
|
||||
if (Htarg <= htmax[m])
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
|
@@ -2,7 +2,11 @@
|
||||
|
||||
bool register_x11gost_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (X11GOST_4WAY)
|
||||
#if defined (X11GOST_8WAY)
|
||||
init_x11gost_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x11gost_8way;
|
||||
gate->hash = (void*)&x11gost_8way_hash;
|
||||
#elif defined (X11GOST_4WAY)
|
||||
init_x11gost_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_x11gost_4way;
|
||||
gate->hash = (void*)&x11gost_4way_hash;
|
||||
@@ -11,7 +15,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x11gost;
|
||||
gate->hash = (void*)&x11gost_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,29 +4,36 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define X11GOST_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define X11GOST_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define X11GOST_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_x11gost_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(X11GOST_4WAY)
|
||||
#if defined(X11GOST_8WAY)
|
||||
|
||||
void x11gost_8way_hash( void *state, const void *input );
|
||||
int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_x11gost_8way_ctx();
|
||||
|
||||
#elif defined(X11GOST_4WAY)
|
||||
|
||||
void x11gost_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_x11gost_4way_ctx();
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void x11gost_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_x11gost( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_x11gost_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -1,7 +1,4 @@
|
||||
#include "phi1612-gate.h"
|
||||
|
||||
#if defined(PHI1612_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -9,10 +6,193 @@
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
|
||||
#if defined(PHI1612_8WAY)
|
||||
|
||||
typedef struct {
|
||||
skein512_8way_context skein;
|
||||
jh512_8way_context jh;
|
||||
cube_4way_context cube;
|
||||
sph_fugue512_context fugue;
|
||||
sph_gost512_context gost;
|
||||
hashState_echo echo;
|
||||
} phi1612_8way_ctx_holder;
|
||||
|
||||
phi1612_8way_ctx_holder phi1612_8way_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_phi1612_8way_ctx()
|
||||
{
|
||||
skein512_8way_init( &phi1612_8way_ctx.skein );
|
||||
jh512_8way_init( &phi1612_8way_ctx.jh );
|
||||
cube_4way_init( &phi1612_8way_ctx.cube, 512, 16, 32 );
|
||||
sph_fugue512_init( &phi1612_8way_ctx.fugue );
|
||||
sph_gost512_init( &phi1612_8way_ctx.gost );
|
||||
init_echo( &phi1612_8way_ctx.echo, 512 );
|
||||
};
|
||||
|
||||
void phi1612_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
phi1612_8way_ctx_holder ctx;
|
||||
memcpy( &ctx, &phi1612_8way_ctx, sizeof(phi1612_8way_ctx) );
|
||||
|
||||
// Skein parallel 4way
|
||||
skein512_8way_update( &ctx.skein, input, 80 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
|
||||
// JH
|
||||
jh512_8way_update( &ctx.jh, vhash, 64 );
|
||||
jh512_8way_close( &ctx.jh, vhash );
|
||||
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhash );
|
||||
|
||||
// Cubehash
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
// Fugue
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash4, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash5, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash6, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash7, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||
|
||||
// Gost
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash0 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash1 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash2 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash3 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash4, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash4 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash5, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash5 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash6, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash6 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash7, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash7 );
|
||||
|
||||
// Echo
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
(const BitSequence *) hash0, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash1,
|
||||
(const BitSequence *) hash1, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash2,
|
||||
(const BitSequence *) hash2, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash3,
|
||||
(const BitSequence *) hash3, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash4,
|
||||
(const BitSequence *) hash4, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash5,
|
||||
(const BitSequence *) hash5, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash6,
|
||||
(const BitSequence *) hash6, 512 );
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash7,
|
||||
(const BitSequence *) hash7, 512 );
|
||||
|
||||
memcpy( state, hash0, 32 );
|
||||
memcpy( state+ 32, hash1, 32 );
|
||||
memcpy( state+ 64, hash2, 32 );
|
||||
memcpy( state+ 96, hash3, 32 );
|
||||
memcpy( state+128, hash4, 32 );
|
||||
memcpy( state+160, hash5, 32 );
|
||||
memcpy( state+192, hash6, 32 );
|
||||
memcpy( state+224, hash7, 32 );
|
||||
}
|
||||
|
||||
int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0cff;
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
phi1612_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= Htarg )
|
||||
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n += 8;
|
||||
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(PHI1612_4WAY)
|
||||
|
||||
|
||||
typedef struct {
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
|
@@ -2,7 +2,11 @@
|
||||
|
||||
bool register_phi1612_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(PHI1612_4WAY)
|
||||
#if defined(PHI1612_8WAY)
|
||||
init_phi1612_8way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_phi1612_8way;
|
||||
gate->hash = (void*)&phi1612_8way_hash;
|
||||
#elif defined(PHI1612_4WAY)
|
||||
init_phi1612_4way_ctx();
|
||||
gate->scanhash = (void*)&scanhash_phi1612_4way;
|
||||
gate->hash = (void*)&phi1612_4way_hash;
|
||||
@@ -11,7 +15,7 @@ bool register_phi1612_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_phi1612;
|
||||
gate->hash = (void*)&phi1612_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -4,29 +4,35 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__) && defined(__AES__)
|
||||
#define PHI1612_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define PHI1612_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define PHI1612_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_phi1612_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(PHI1612_4WAY)
|
||||
#if defined(PHI1612_8WAY)
|
||||
|
||||
void phi1612_8way_hash( void *state, const void *input );
|
||||
int scanhash_phi1612_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_phi1612_8way_ctx();
|
||||
|
||||
#elif defined(PHI1612_4WAY)
|
||||
|
||||
void phi1612_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_phi1612_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_phi1612_4way_ctx();
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void phi1612_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_phi1612( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void init_phi1612_ctx();
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@@ -1,7 +1,4 @@
|
||||
#include "skunk-gate.h"
|
||||
|
||||
#if defined(SKUNK_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -10,6 +7,146 @@
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
|
||||
#if defined(SKUNK_8WAY)
|
||||
|
||||
typedef struct {
|
||||
skein512_8way_context skein;
|
||||
cube_4way_context cube;
|
||||
sph_fugue512_context fugue;
|
||||
sph_gost512_context gost;
|
||||
} skunk_8way_ctx_holder;
|
||||
|
||||
static __thread skunk_8way_ctx_holder skunk_8way_ctx;
|
||||
|
||||
void skunk_8way_hash( void *output, const void *input )
|
||||
{
|
||||
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash3[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash4[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
|
||||
skunk_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &skunk_8way_ctx, sizeof(skunk_8way_ctx) );
|
||||
|
||||
skein512_8way_update( &ctx.skein, input, 80 );
|
||||
skein512_8way_close( &ctx.skein, vhash );
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7, vhash, 512 );
|
||||
|
||||
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
|
||||
cube_4way_init( &ctx.cube, 512, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
|
||||
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
|
||||
|
||||
sph_fugue512( &ctx.fugue, hash0, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash0 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash1, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash1 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash2, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash2 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash3, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash3 );
|
||||
sph_fugue512( &ctx.fugue, hash4, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash4 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash5, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash5 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash6, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash6 );
|
||||
sph_fugue512_init( &ctx.fugue );
|
||||
sph_fugue512( &ctx.fugue, hash7, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash7 );
|
||||
|
||||
sph_gost512( &ctx.gost, hash0, 64 );
|
||||
sph_gost512_close( &ctx.gost, output );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash1, 64 );
|
||||
sph_gost512_close( &ctx.gost, output+ 32 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash2, 64 );
|
||||
sph_gost512_close( &ctx.gost, output+ 64 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash3, 64 );
|
||||
sph_gost512_close( &ctx.gost, output+ 96 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash4, 64 );
|
||||
sph_gost512_close( &ctx.gost, output+128 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash5, 64 );
|
||||
sph_gost512_close( &ctx.gost, output+160 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash6, 64 );
|
||||
sph_gost512_close( &ctx.gost, output+192 );
|
||||
sph_gost512_init( &ctx.gost );
|
||||
sph_gost512( &ctx.gost, hash7, 64 );
|
||||
sph_gost512_close( &ctx.gost, output+224 );
|
||||
}
|
||||
|
||||
int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
volatile uint8_t *restart = &(work_restart[thr_id].restart);
|
||||
|
||||
if ( opt_benchmark )
|
||||
((uint32_t*)ptarget)[7] = 0x0cff;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
skunk_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
|
||||
if ( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_lane_solution( work, hash+(i<<3), mythr, i );
|
||||
}
|
||||
n +=8;
|
||||
} while ( likely( ( n < max_nonce-8 ) && !(*restart) ) );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool skunk_8way_thread_init()
|
||||
{
|
||||
skein512_8way_init( &skunk_8way_ctx.skein );
|
||||
cube_4way_init( &skunk_8way_ctx.cube, 512, 16, 32 );
|
||||
sph_fugue512_init( &skunk_8way_ctx.fugue );
|
||||
sph_gost512_init( &skunk_8way_ctx.gost );
|
||||
return true;
|
||||
}
|
||||
|
||||
#elif defined(SKUNK_4WAY)
|
||||
|
||||
typedef struct {
|
||||
skein512_4way_context skein;
|
||||
|
@@ -2,12 +2,15 @@
|
||||
|
||||
bool register_skunk_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
#if defined (SKUNK_4WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined (SKUNK_8WAY)
|
||||
gate->miner_thread_init = (void*)&skunk_8way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk_8way;
|
||||
gate->hash = (void*)&skunk_8way_hash;
|
||||
#elif defined (SKUNK_4WAY)
|
||||
gate->miner_thread_init = (void*)&skunk_4way_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk_4way;
|
||||
gate->hash = (void*)&skunk_4way_hash;
|
||||
// init_skunk_4way_ctx();
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&skunk_thread_init;
|
||||
gate->scanhash = (void*)&scanhash_skunk;
|
||||
|
@@ -4,29 +4,33 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define SKUNK_4WAY
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SKUNK_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SKUNK_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_skunk_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(SKUNK_4WAY)
|
||||
#if defined(SKUNK_8WAY)
|
||||
|
||||
void skunk_8way_hash( void *state, const void *input );
|
||||
int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool skunk_8way_thread_init();
|
||||
|
||||
#elif defined(SKUNK_4WAY)
|
||||
|
||||
void skunk_4way_hash( void *state, const void *input );
|
||||
|
||||
int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
bool skunk_4way_thread_init();
|
||||
//void init_skunk_4way_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
void skunkhash( void *state, const void *input );
|
||||
|
||||
int scanhash_skunk( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
bool skunk_thread_init();
|
||||
|
||||
#endif
|
||||
|
@@ -127,6 +127,7 @@ void x17_4way_hash( void *state, const void *input )
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
|
||||
// 11 Echo serial
|
||||
init_echo( &ctx.echo, 512 );
|
||||
update_final_echo( &ctx.echo, (BitSequence *)hash0,
|
||||
|
@@ -29,7 +29,7 @@ rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure --with-curl
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-aes-avx.exe
|
||||
mv cpuminer.exe cpuminer-avx.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-aes-avx
|
||||
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.1.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.10.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.10.1'
|
||||
PACKAGE_VERSION='3.10.2'
|
||||
PACKAGE_STRING='cpuminer-opt 3.10.2'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.10.1 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.10.1:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.10.1
|
||||
cpuminer-opt configure 3.10.2
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.10.1, which was
|
||||
It was created by cpuminer-opt $as_me 3.10.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.10.1'
|
||||
VERSION='3.10.2'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.10.1, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.10.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.10.1
|
||||
cpuminer-opt config.status 3.10.2
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.10.1])
|
||||
AC_INIT([cpuminer-opt], [3.10.2])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
@@ -3327,7 +3327,7 @@ static void show_credits()
|
||||
{
|
||||
printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n");
|
||||
printf(" A CPU miner with multi algo support and optimized for CPUs\n");
|
||||
printf(" with AES_NI and AVX2, AVX512 and SHA extensions.\n");
|
||||
printf(" with AES_NI, AVX2, AVX512 and SHA extensions.\n");
|
||||
printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n");
|
||||
}
|
||||
|
||||
@@ -3343,7 +3343,7 @@ bool check_cpu_capability ()
|
||||
bool cpu_has_avx512 = has_avx512();
|
||||
bool cpu_has_vaes = has_vaes();
|
||||
bool sw_has_aes = false;
|
||||
bool sw_has_sse2 = false;
|
||||
bool sw_has_sse2 = false;
|
||||
bool sw_has_sse42 = false;
|
||||
bool sw_has_avx = false;
|
||||
bool sw_has_avx2 = false;
|
||||
@@ -3412,8 +3412,8 @@ bool check_cpu_capability ()
|
||||
|
||||
printf("CPU features:");
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sha ) printf( " SHA" );
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
else if ( cpu_has_avx2 ) printf( " AVX2" );
|
||||
else if ( cpu_has_avx ) printf( " AVX" );
|
||||
|
@@ -1528,6 +1528,58 @@ static inline void intrlv_8x64( void *dst, const void *src0,
|
||||
d[63] = _mm_unpackhi_epi64( s6[7], s7[7] );
|
||||
}
|
||||
|
||||
static inline void intrlv_8x64_512( void *dst, const void *src0,
|
||||
const void *src1, const void *src2, const void *src3,
|
||||
const void *src4, const void *src5, const void *src6,
|
||||
const void *src7 )
|
||||
{
|
||||
__m128i *d = (__m128i*)dst;
|
||||
const __m128i *s0 = (const __m128i*)src0;
|
||||
const __m128i *s1 = (const __m128i*)src1;
|
||||
const __m128i *s2 = (const __m128i*)src2;
|
||||
const __m128i *s3 = (const __m128i*)src3;
|
||||
const __m128i *s4 = (const __m128i*)src4;
|
||||
const __m128i *s5 = (const __m128i*)src5;
|
||||
const __m128i *s6 = (const __m128i*)src6;
|
||||
const __m128i *s7 = (const __m128i*)src7;
|
||||
|
||||
d[ 0] = _mm_unpacklo_epi64( s0[0], s1[0] );
|
||||
d[ 1] = _mm_unpacklo_epi64( s2[0], s3[0] );
|
||||
d[ 2] = _mm_unpacklo_epi64( s4[0], s5[0] );
|
||||
d[ 3] = _mm_unpacklo_epi64( s6[0], s7[0] );
|
||||
d[ 4] = _mm_unpackhi_epi64( s0[0], s1[0] );
|
||||
d[ 5] = _mm_unpackhi_epi64( s2[0], s3[0] );
|
||||
d[ 6] = _mm_unpackhi_epi64( s4[0], s5[0] );
|
||||
d[ 7] = _mm_unpackhi_epi64( s6[0], s7[0] );
|
||||
|
||||
d[ 8] = _mm_unpacklo_epi64( s0[1], s1[1] );
|
||||
d[ 9] = _mm_unpacklo_epi64( s2[1], s3[1] );
|
||||
d[10] = _mm_unpacklo_epi64( s4[1], s5[1] );
|
||||
d[11] = _mm_unpacklo_epi64( s6[1], s7[1] );
|
||||
d[12] = _mm_unpackhi_epi64( s0[1], s1[1] );
|
||||
d[13] = _mm_unpackhi_epi64( s2[1], s3[1] );
|
||||
d[14] = _mm_unpackhi_epi64( s4[1], s5[1] );
|
||||
d[15] = _mm_unpackhi_epi64( s6[1], s7[1] );
|
||||
|
||||
d[16] = _mm_unpacklo_epi64( s0[2], s1[2] );
|
||||
d[17] = _mm_unpacklo_epi64( s2[2], s3[2] );
|
||||
d[18] = _mm_unpacklo_epi64( s4[2], s5[2] );
|
||||
d[19] = _mm_unpacklo_epi64( s6[2], s7[2] );
|
||||
d[20] = _mm_unpackhi_epi64( s0[2], s1[2] );
|
||||
d[21] = _mm_unpackhi_epi64( s2[2], s3[2] );
|
||||
d[22] = _mm_unpackhi_epi64( s4[2], s5[2] );
|
||||
d[23] = _mm_unpackhi_epi64( s6[2], s7[2] );
|
||||
|
||||
d[24] = _mm_unpacklo_epi64( s0[3], s1[3] );
|
||||
d[25] = _mm_unpacklo_epi64( s2[3], s3[3] );
|
||||
d[26] = _mm_unpacklo_epi64( s4[3], s5[3] );
|
||||
d[27] = _mm_unpacklo_epi64( s6[3], s7[3] );
|
||||
d[28] = _mm_unpackhi_epi64( s0[3], s1[3] );
|
||||
d[29] = _mm_unpackhi_epi64( s2[3], s3[3] );
|
||||
d[30] = _mm_unpackhi_epi64( s4[3], s5[3] );
|
||||
d[31] = _mm_unpackhi_epi64( s6[3], s7[3] );
|
||||
}
|
||||
|
||||
/*
|
||||
#define ILEAVE_8x64( i ) do \
|
||||
{ \
|
||||
@@ -1656,6 +1708,57 @@ static inline void dintrlv_8x64( void *dst0, void *dst1, void *dst2,
|
||||
d7[7] = _mm_unpackhi_epi64( s[59], s[63] );
|
||||
}
|
||||
|
||||
static inline void dintrlv_8x64_512( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
|
||||
const void *src )
|
||||
{
|
||||
__m128i *d0 = (__m128i*)dst0;
|
||||
__m128i *d1 = (__m128i*)dst1;
|
||||
__m128i *d2 = (__m128i*)dst2;
|
||||
__m128i *d3 = (__m128i*)dst3;
|
||||
__m128i *d4 = (__m128i*)dst4;
|
||||
__m128i *d5 = (__m128i*)dst5;
|
||||
__m128i *d6 = (__m128i*)dst6;
|
||||
__m128i *d7 = (__m128i*)dst7;
|
||||
const __m128i* s = (const __m128i*)src;
|
||||
|
||||
d0[0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
|
||||
d1[0] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
|
||||
d2[0] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
|
||||
d3[0] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
|
||||
d4[0] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
|
||||
d5[0] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
|
||||
d6[0] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
|
||||
d7[0] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
|
||||
|
||||
d0[1] = _mm_unpacklo_epi64( s[ 8], s[12] );
|
||||
d1[1] = _mm_unpackhi_epi64( s[ 8], s[12] );
|
||||
d2[1] = _mm_unpacklo_epi64( s[ 9], s[13] );
|
||||
d3[1] = _mm_unpackhi_epi64( s[ 9], s[13] );
|
||||
d4[1] = _mm_unpacklo_epi64( s[10], s[14] );
|
||||
d5[1] = _mm_unpackhi_epi64( s[10], s[14] );
|
||||
d6[1] = _mm_unpacklo_epi64( s[11], s[15] );
|
||||
d7[1] = _mm_unpackhi_epi64( s[11], s[15] );
|
||||
|
||||
d0[2] = _mm_unpacklo_epi64( s[16], s[20] );
|
||||
d1[2] = _mm_unpackhi_epi64( s[16], s[20] );
|
||||
d2[2] = _mm_unpacklo_epi64( s[17], s[21] );
|
||||
d3[2] = _mm_unpackhi_epi64( s[17], s[21] );
|
||||
d4[2] = _mm_unpacklo_epi64( s[18], s[22] );
|
||||
d5[2] = _mm_unpackhi_epi64( s[18], s[22] );
|
||||
d6[2] = _mm_unpacklo_epi64( s[19], s[23] );
|
||||
d7[2] = _mm_unpackhi_epi64( s[19], s[23] );
|
||||
|
||||
d0[3] = _mm_unpacklo_epi64( s[24], s[28] );
|
||||
d1[3] = _mm_unpackhi_epi64( s[24], s[28] );
|
||||
d2[3] = _mm_unpacklo_epi64( s[25], s[29] );
|
||||
d3[3] = _mm_unpackhi_epi64( s[25], s[29] );
|
||||
d4[3] = _mm_unpacklo_epi64( s[26], s[30] );
|
||||
d5[3] = _mm_unpackhi_epi64( s[26], s[30] );
|
||||
d6[3] = _mm_unpacklo_epi64( s[27], s[31] );
|
||||
d7[3] = _mm_unpackhi_epi64( s[27], s[31] );
|
||||
}
|
||||
|
||||
/*
|
||||
#define DLEAVE_8x64( i ) do \
|
||||
{ \
|
||||
@@ -1910,6 +2013,32 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
|
||||
}
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
|
||||
{
|
||||
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
__m128i s0 = casti_m128i( src,0 );
|
||||
__m128i s1 = casti_m128i( src,1 );
|
||||
__m128i s2 = casti_m128i( src,2 );
|
||||
__m128i s3 = casti_m128i( src,3 );
|
||||
__m128i s4 = casti_m128i( src,4 );
|
||||
|
||||
s0 = _mm_shuffle_epi8( s0, bswap_shuf );
|
||||
s1 = _mm_shuffle_epi8( s1, bswap_shuf );
|
||||
s2 = _mm_shuffle_epi8( s2, bswap_shuf );
|
||||
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
|
||||
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
|
||||
|
||||
casti_m512i( d, 0 ) = _mm512_broadcast_i64x2( s0 );
|
||||
casti_m512i( d, 1 ) = _mm512_broadcast_i64x2( s1 );
|
||||
casti_m512i( d, 2 ) = _mm512_broadcast_i64x2( s2 );
|
||||
casti_m512i( d, 3 ) = _mm512_broadcast_i64x2( s3 );
|
||||
casti_m512i( d, 4 ) = _mm512_broadcast_i64x2( s4 );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// 2x256 (AVX512)
|
||||
|
||||
#if defined (__AVX__)
|
||||
@@ -1946,6 +2075,9 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
|
||||
d0[3] = s[6]; d1[3] = s[7];
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#endif // AVX
|
||||
|
||||
///////////////////////////
|
||||
|
@@ -243,7 +243,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define mm128_ror_64 _mm_ror_epi64
|
||||
|
@@ -454,6 +454,13 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
// Swap 32 bit elements in each 64 bit lane
|
||||
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define mm256_rol1x16_64( v ) _mm256_rol_epi64( v, 16 )
|
||||
#define mm256_ror1x16_64( v ) _mm256_ror_epi64( v, 16 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_ror1x16_64( v ) \
|
||||
_mm256_shuffle_epi8( v, \
|
||||
m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
|
||||
@@ -463,6 +470,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
_mm256_shuffle_epi8( v, \
|
||||
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
|
||||
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
|
||||
#endif
|
||||
|
||||
#define mm256_ror1x8_64( v ) \
|
||||
_mm256_shuffle_epi8( v, \
|
||||
@@ -486,10 +494,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
|
||||
|
||||
|
||||
// Swap 16 bit elements in each 32 bit lane
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define mm256_swap16_32( v ) _mm256_rol_epi32( v, 16 )
|
||||
|
||||
#else
|
||||
|
||||
#define mm256_swap16_32( v ) \
|
||||
_mm256_shuffle_epi8( v, \
|
||||
m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
|
||||
0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
|
||||
#endif
|
||||
|
||||
//
|
||||
// Swap bytes in vector elements, endian bswap.
|
||||
|
@@ -13,20 +13,31 @@
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// AVX512 intrinsics have a few peculiarities with permutes and shuffles
|
||||
// that are inconsistent with previous AVX2 implementations.
|
||||
// AVX512 intrinsics have a few changes from previous conventions.
|
||||
//
|
||||
// Some instructions like cmp and blend use the mask regsiters now instead
|
||||
// a vector mask.
|
||||
//
|
||||
// The new rotate instructions require the count to be only an 8 bit
|
||||
// immediate value. The documentation is the same as for shift and
|
||||
// it allows variables. Suspect a compiler issue but it still happens
|
||||
// in GCC9.
|
||||
//
|
||||
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
|
||||
// usually shuffles accross all lanes.
|
||||
//
|
||||
// Some instructions like cmp and blend use a mask regsiter now instead
|
||||
// a mask vector.
|
||||
//
|
||||
// permutexvar has args reversed, index is first arg. Previously all
|
||||
// permutes and shuffles have the source vector first.
|
||||
// permutes and shuffles have the index last.
|
||||
//
|
||||
// _mm512_permutexvar_epi8 requires AVX512-VBMI, larger elements don't.
|
||||
// It also performs the same op as _mm512_shuffle_epi8.
|
||||
//
|
||||
// _mm512_shuffle_epi8 shuffles accross entire 512 bits. Shuffle usually
|
||||
// doesn't cross 128 bit lane boundaries.
|
||||
// shuffle_epi8 shuffles accross entire 512 bits. Shuffle usually
|
||||
// doesn't cross 128 bit lane boundaries but is consistent with AVX2
|
||||
// where shuffle_epi8 spans the entire vector.
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
//
|
||||
@@ -35,7 +46,6 @@
|
||||
// Other AVX512 extensions that may be required for some functions.
|
||||
// __AVX512VBMI__ __AVX512VAES__
|
||||
//
|
||||
// Experimental, not fully tested.
|
||||
|
||||
// Move integer to/from element 0 of vector.
|
||||
|
||||
@@ -88,10 +98,19 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
|
||||
return mm512_concat_256( hi, lo );
|
||||
}
|
||||
|
||||
// Equivalent of set4, broadcast 256 bits in groups of four 64 bit constants
|
||||
// to all 256 bit lanes: {i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0}.
|
||||
// Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
|
||||
#define m512_const1_256( i ) _mm512_broadcast_i64x4( i )
|
||||
#define m512_const1_128( i ) _mm512_broadcast_i64x2( i )
|
||||
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
|
||||
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
|
||||
#define m512_const2_64( i1, i0 ) \
|
||||
m512_const1_128( m128_const_64( i1, i0 ) )
|
||||
|
||||
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
const uint64_t i1, const uint64_t i0 )
|
||||
const uint64_t i1, const uint64_t i0 )
|
||||
{
|
||||
__m256i lo = mm256_mov64_256( i0 );
|
||||
__m128i hi = mm128_mov64_128( i2 );
|
||||
@@ -99,25 +118,9 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
_mm_insert_epi64( _mm256_castsi256_si128(
|
||||
lo ), i1, 1 ) );
|
||||
hi = _mm_insert_epi64( hi, i3, 1 );
|
||||
return _mm512_permutex_epi64( _mm512_castsi256_si512(
|
||||
_mm256_inserti128_si256( lo, hi, 1 ) ), 0xe4 );
|
||||
return _mm512_broadcast_i64x4( _mm256_inserti128_si256( lo, hi, 1 ) );
|
||||
}
|
||||
|
||||
// Broadcast 128 bits in pairs of 64 bit constants {i1. i0} to all
|
||||
// 128 bit lanes.
|
||||
#define m512_const2_64( i1, i0 ) \
|
||||
_mm512_permutex_epi64( _mm512_castsi128_si512( \
|
||||
m128_const_64( i1, i0 ) ), 0x44 )
|
||||
|
||||
// Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
|
||||
#define m512_const1_256( i ) _mm512_broadcast_i64x4( i )
|
||||
#define m512_const1_128( i ) _mm512_broadcast_i64x2( i )
|
||||
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
|
||||
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
|
||||
|
||||
//
|
||||
// Pseudo constants.
|
||||
|
||||
@@ -136,17 +139,6 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
|
||||
#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
|
||||
|
||||
/*
|
||||
// EVEX vcmpeqq returns a bit mask instead of a vector
|
||||
static inline __m512i mm512_neg1_fn()
|
||||
{
|
||||
__m512i a;
|
||||
asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
|
||||
return a;
|
||||
}
|
||||
#define m512_neg1 mm512_neg1_fn()
|
||||
*/
|
||||
|
||||
//
|
||||
// Basic operations without SIMD equivalent
|
||||
|
||||
@@ -209,7 +201,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
|
||||
|
||||
// Horizontal vector testing
|
||||
// Returns bit mask
|
||||
// Returns bit __mmask8
|
||||
#define mm512_allbits0( a ) _mm512_cmpeq_epi64_mask( a, m512_zero )
|
||||
#define mm512_allbits1( a ) _mm512_cmpeq_epi64_mask( a, m512_neg1 )
|
||||
#define mm512_anybits0( a ) _mm512_cmpneq_epi64_mask( a, m512_neg1 )
|
||||
@@ -514,6 +506,12 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 )
|
||||
|
||||
// Rotate each 64 bit lane by one 16 bit element.
|
||||
#define mm512_ror1x16_64( v ) _mm512_ror_epi64( v, 16 )
|
||||
#define mm512_rol1x16_64( v ) _mm512_rol_epi64( v, 16 )
|
||||
#define mm512_ror1x8_64( v ) _mm512_ror_epi64( v, 8 )
|
||||
#define mm512_rol1x8_64( v ) _mm512_rol_epi64( v, 8 )
|
||||
|
||||
/*
|
||||
#define mm512_ror1x16_64( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x001c001f001e001d, 0x0018001b001a0019, \
|
||||
@@ -541,10 +539,16 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
0x2E2D2C2B2A29282F, 0x2625242322212027, \
|
||||
0x1E1D1C1B1A19181F, 0x1615141312111017, \
|
||||
0x0E0D0C0B0A09080F, 0x0605040302010007 ) )
|
||||
*/
|
||||
|
||||
//
|
||||
// Rotate elements within 32 bit lanes.
|
||||
|
||||
#define mm512_swap16_32( v ) _mm512_ror_epi32( v, 16 )
|
||||
#define mm512_ror1x8_32( v ) _mm512_ror_epi32( v, 8 )
|
||||
#define mm512_rol1x8_32( v ) _mm512_rol_epi32( v, 8 )
|
||||
|
||||
/*
|
||||
#define mm512_swap16_32( v ) \
|
||||
_mm512_permutexvar_epi16( m512_const_64( \
|
||||
0x001e001f001c001d, 0x001a001b00180019, \
|
||||
@@ -565,6 +569,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
0x2E2D2C2F2A29282B, 0x2625242722212023, \
|
||||
0x1E1D1C1F1A19181B, 0x1615141712111013, \
|
||||
0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
|
||||
*/
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Rotate elements from 2 512 bit vectors in place, source arguments
|
||||
|
@@ -62,7 +62,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
# GCC 9 doesn't include AES in core-avx2
|
||||
# GCC 9 doesn't include AES in -march=core-avx2
|
||||
CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
@@ -70,7 +70,8 @@ mv cpuminer.exe release/cpuminer-avx2.exe
|
||||
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $CONFIGURE_ARGS
|
||||
# -march=corei7-avx still includes aes, but just in case
|
||||
CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 16
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx.exe
|
||||
|
Reference in New Issue
Block a user