mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.15.2
This commit is contained in:
24
README.txt
24
README.txt
@@ -1,6 +1,10 @@
|
||||
This file is included in the Windows binary package. Compile instructions
|
||||
for Linux and Windows can be found in RELEASE_NOTES.
|
||||
|
||||
This package is officially avalable only from:
|
||||
https://github.com/JayDDee/cpuminer-opt
|
||||
No other sources should be trusted.
|
||||
|
||||
cpuminer is a console program that is executed from a DOS or Powershell
|
||||
prompt. There is no GUI and no mouse support.
|
||||
|
||||
@@ -31,20 +35,22 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
|
||||
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
|
||||
|
||||
|
||||
Exe file name Compile flags Arch name
|
||||
Exe file name Compile flags Arch name
|
||||
|
||||
cpuminer-sse2.exe "-msse2" Core2, Nehalem
|
||||
cpuminer-aes-sse42.exe "-march=westmere" Westmere
|
||||
cpuminer-aes-sse42.exe "-marxh=westmere" Westmere
|
||||
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell*
|
||||
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1)
|
||||
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
|
||||
cpuminer-zen.exe "-march=znver1" AMD Ryzen, Threadripper
|
||||
cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake*
|
||||
cpuminer-zen.exe "-march=znver1" Zen1, Zen2
|
||||
cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(2)
|
||||
cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake(3)
|
||||
|
||||
* Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
|
||||
Icelake is only available on some laptops. Mining with a laptop is not
|
||||
recommended. The icelake build is included in anticipation of Intel eventually
|
||||
releasing a desktop CPU with a microarchitecture newer than Skylake.
|
||||
(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
|
||||
(2) Zen3 build uses Zen2+VAES as workaround until Zen3 compiler support is
|
||||
available. Zen2 CPUs should use Zen build.
|
||||
(3) Icelake is only available on some laptops. Mining with a laptop is not
|
||||
recommended.
|
||||
|
||||
Notes about included DLL files:
|
||||
|
||||
|
@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.15.2
|
||||
|
||||
Zen3 AVX2+VAES optimization for x16*, x17, sonoa, xevan, x21s, x22i, x25x,
|
||||
allium.
|
||||
Zen3 build added to Windows binary package.
|
||||
|
||||
v3.15.1
|
||||
|
||||
Fix compile on AMD Zen3 CPUs with VAES.
|
||||
|
@@ -90,10 +90,11 @@ typedef uint32_t set_t;
|
||||
#define AES_OPT 2
|
||||
#define SSE42_OPT 4
|
||||
#define AVX_OPT 8 // Sandybridge
|
||||
#define AVX2_OPT 0x10 // Haswell
|
||||
#define SHA_OPT 0x20 // sha256 (Ryzen, Ice Lake)
|
||||
#define AVX512_OPT 0x40 // AVX512- F, VL, DQ, BW (Skylake-X)
|
||||
#define VAES_OPT 0x80 // VAES (Ice Lake)
|
||||
#define AVX2_OPT 0x10 // Haswell, Zen1
|
||||
#define SHA_OPT 0x20 // Zen1, Icelake (sha256)
|
||||
#define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW])
|
||||
#define VAES_OPT 0x80 // Icelake (VAES & AVX512)
|
||||
#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512)
|
||||
|
||||
|
||||
// return set containing all elements from sets a & b
|
||||
@@ -111,9 +112,9 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
|
||||
typedef struct
|
||||
{
|
||||
// Mandatory functions, one of these is mandatory. If a generic scanhash
|
||||
// is used a custom hash function must be registered, with a custom scanhash
|
||||
// the custom hash function can be called directly and doesn't need to be
|
||||
// registered in the gate.
|
||||
// is used a custom target hash function must be registered, with a custom
|
||||
// scanhash the target hash function can be called directly and doesn't need
|
||||
// to be registered in the gate.
|
||||
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
|
||||
|
||||
int ( *hash ) ( void*, const void*, int );
|
||||
|
@@ -1,5 +1,4 @@
|
||||
//#if 0
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__VAES__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "echo-hash-4way.h"
|
||||
@@ -13,8 +12,12 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
*/
|
||||
// do these need to be reversed?
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
|
||||
#define mul2mask \
|
||||
_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
|
||||
m512_const2_64( 0, 0x00001b00 )
|
||||
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
|
||||
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
|
||||
|
||||
#define lsbmask m512_const1_32( 0x01010101 )
|
||||
@@ -30,87 +33,87 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
|
||||
const int j2 = ( (j)+2 ) & 3; \
|
||||
const int j3 = ( (j)+3 ) & 3; \
|
||||
s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask );\
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ] [j ] = s2; \
|
||||
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\
|
||||
s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
|
||||
t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask );\
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ] [j ] = s2; \
|
||||
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\
|
||||
s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
|
||||
t1 = _mm512_and_si512( t1, lsbmask ); \
|
||||
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
|
||||
s2 = _mm512_xor_si512( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
|
||||
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
ECHO_SUBBYTES(_state, 2, 0);\
|
||||
ECHO_SUBBYTES(_state, 3, 0);\
|
||||
ECHO_SUBBYTES(_state, 0, 1);\
|
||||
ECHO_SUBBYTES(_state, 1, 1);\
|
||||
ECHO_SUBBYTES(_state, 2, 1);\
|
||||
ECHO_SUBBYTES(_state, 3, 1);\
|
||||
ECHO_SUBBYTES(_state, 0, 2);\
|
||||
ECHO_SUBBYTES(_state, 1, 2);\
|
||||
ECHO_SUBBYTES(_state, 2, 2);\
|
||||
ECHO_SUBBYTES(_state, 3, 2);\
|
||||
ECHO_SUBBYTES(_state, 0, 3);\
|
||||
ECHO_SUBBYTES(_state, 1, 3);\
|
||||
ECHO_SUBBYTES(_state, 2, 3);\
|
||||
ECHO_SUBBYTES(_state, 3, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 0);\
|
||||
ECHO_SUBBYTES(_state2, 1, 0);\
|
||||
ECHO_SUBBYTES(_state2, 2, 0);\
|
||||
ECHO_SUBBYTES(_state2, 3, 0);\
|
||||
ECHO_SUBBYTES(_state2, 0, 1);\
|
||||
ECHO_SUBBYTES(_state2, 1, 1);\
|
||||
ECHO_SUBBYTES(_state2, 2, 1);\
|
||||
ECHO_SUBBYTES(_state2, 3, 1);\
|
||||
ECHO_SUBBYTES(_state2, 0, 2);\
|
||||
ECHO_SUBBYTES(_state2, 1, 2);\
|
||||
ECHO_SUBBYTES(_state2, 2, 2);\
|
||||
ECHO_SUBBYTES(_state2, 3, 2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 3);\
|
||||
ECHO_SUBBYTES(_state2, 1, 3);\
|
||||
ECHO_SUBBYTES(_state2, 2, 3);\
|
||||
ECHO_SUBBYTES(_state2, 3, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
ECHO_SUBBYTES(_state, 2, 0);\
|
||||
ECHO_SUBBYTES(_state, 3, 0);\
|
||||
ECHO_SUBBYTES(_state, 0, 1);\
|
||||
ECHO_SUBBYTES(_state, 1, 1);\
|
||||
ECHO_SUBBYTES(_state, 2, 1);\
|
||||
ECHO_SUBBYTES(_state, 3, 1);\
|
||||
ECHO_SUBBYTES(_state, 0, 2);\
|
||||
ECHO_SUBBYTES(_state, 1, 2);\
|
||||
ECHO_SUBBYTES(_state, 2, 2);\
|
||||
ECHO_SUBBYTES(_state, 3, 2);\
|
||||
ECHO_SUBBYTES(_state, 0, 3);\
|
||||
ECHO_SUBBYTES(_state, 1, 3);\
|
||||
ECHO_SUBBYTES(_state, 2, 3);\
|
||||
ECHO_SUBBYTES(_state, 3, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 0);\
|
||||
ECHO_SUBBYTES(_state2, 1, 0);\
|
||||
ECHO_SUBBYTES(_state2, 2, 0);\
|
||||
ECHO_SUBBYTES(_state2, 3, 0);\
|
||||
ECHO_SUBBYTES(_state2, 0, 1);\
|
||||
ECHO_SUBBYTES(_state2, 1, 1);\
|
||||
ECHO_SUBBYTES(_state2, 2, 1);\
|
||||
ECHO_SUBBYTES(_state2, 3, 1);\
|
||||
ECHO_SUBBYTES(_state2, 0, 2);\
|
||||
ECHO_SUBBYTES(_state2, 1, 2);\
|
||||
ECHO_SUBBYTES(_state2, 2, 2);\
|
||||
ECHO_SUBBYTES(_state2, 3, 2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 3);\
|
||||
ECHO_SUBBYTES(_state2, 1, 3);\
|
||||
ECHO_SUBBYTES(_state2, 2, 3);\
|
||||
ECHO_SUBBYTES(_state2, 3, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
@@ -224,43 +227,43 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
|
||||
|
||||
int echo_4way_init( echo_4way_context *ctx, int nHashSize )
|
||||
{
|
||||
int i, j;
|
||||
int i, j;
|
||||
|
||||
ctx->k = m512_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
|
||||
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
|
||||
break;
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
|
||||
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
|
||||
break;
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m512_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x400);
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m512_zero;
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m512_zero;
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
@@ -285,17 +288,13 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( state->buffer, data, vlen );
|
||||
state->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
|
||||
|
||||
remainingbits = m512_const2_64( 0, (uint64_t)databitlen );
|
||||
}
|
||||
|
||||
state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
|
||||
state->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
|
||||
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] =
|
||||
_mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 );
|
||||
state->buffer[ vblen-1 ] =
|
||||
_mm512_set4_epi64( 0, state->processed_bits,
|
||||
0, state->processed_bits );
|
||||
state->buffer[ vblen-2 ] = m512_const2_64( (uint64_t)state->uHashSize << 48, 0 );
|
||||
state->buffer[ vblen-1 ] = m512_const2_64( 0, state->processed_bits);
|
||||
|
||||
state->k = _mm512_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm512_sub_epi64( state->k, state->const1536 );
|
||||
@@ -328,16 +327,16 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
|
||||
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
|
||||
ctx->hashsize = m512_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
|
||||
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
|
||||
ctx->hashsize = m512_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m512_const2_64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -372,17 +371,14 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( ctx->buffer, data, vlen );
|
||||
ctx->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
|
||||
|
||||
remainingbits = m512_const2_64( 0, databitlen );
|
||||
}
|
||||
|
||||
ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
|
||||
ctx->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
|
||||
memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
ctx->buffer[ vblen-2 ] =
|
||||
_mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 );
|
||||
ctx->buffer[ vblen-1 ] =
|
||||
_mm512_set4_epi64( 0, ctx->processed_bits,
|
||||
0, ctx->processed_bits );
|
||||
m512_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
|
||||
ctx->buffer[ vblen-1 ] = m512_const2_64( 0, ctx->processed_bits);
|
||||
|
||||
ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
|
||||
ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
|
||||
@@ -400,5 +396,380 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#endif
|
||||
// AVX2 + VAES
|
||||
|
||||
#define mul2mask_2way m256_const2_64( 0, 0x0000000000001b00 )
|
||||
|
||||
#define lsbmask_2way m256_const1_32( 0x01010101 )
|
||||
|
||||
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
|
||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
|
||||
state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \
|
||||
k1 = _mm256_add_epi32( k1, m256_one_128 );
|
||||
|
||||
#define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \
|
||||
{ \
|
||||
const int j1 = ( (j)+1 ) & 3; \
|
||||
const int j2 = ( (j)+2 ) & 3; \
|
||||
const int j3 = ( (j)+3 ) & 3; \
|
||||
s2 = _mm256_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
|
||||
t1 = _mm256_srli_epi16( state1[ 0 ][ j ], 7 ); \
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way );\
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 ); \
|
||||
state2[ 0 ] [j ] = s2; \
|
||||
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
|
||||
state2[ 3 ] [j ] = _mm256_xor_si256( s2, state1[ 0 ][ j ] );\
|
||||
s2 = _mm256_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
|
||||
t1 = _mm256_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
s2 = _mm256_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
|
||||
t1 = _mm256_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
t1 = _mm256_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
#define ECHO_ROUND_UNROLL2_2WAY \
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 2, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 3, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 2, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 3, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 2, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 3, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 0, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 1, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 2, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state, 3, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 0, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 1, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 2, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 3, 0);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 0, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 1, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 2, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 3, 1);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 0, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 1, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 2, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 3, 2);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 0, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 1, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 2, 3);\
|
||||
ECHO_SUBBYTES_2WAY(_state2, 3, 3);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
#define SAVESTATE_2WAY(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
dst[0][1] = src[0][1];\
|
||||
dst[0][2] = src[0][2];\
|
||||
dst[0][3] = src[0][3];\
|
||||
dst[1][0] = src[1][0];\
|
||||
dst[1][1] = src[1][1];\
|
||||
dst[1][2] = src[1][2];\
|
||||
dst[1][3] = src[1][3];\
|
||||
dst[2][0] = src[2][0];\
|
||||
dst[2][1] = src[2][1];\
|
||||
dst[2][2] = src[2][2];\
|
||||
dst[2][3] = src[2][3];\
|
||||
dst[3][0] = src[3][0];\
|
||||
dst[3][1] = src[3][1];\
|
||||
dst[3][2] = src[3][2];\
|
||||
dst[3][3] = src[3][3]
|
||||
|
||||
// blockcount always 1
|
||||
void echo_2way_compress( echo_2way_context *ctx, const __m256i *pmsg,
|
||||
unsigned int uBlockCount )
|
||||
{
|
||||
unsigned int r, b, i, j;
|
||||
__m256i t1, t2, s2, k1;
|
||||
__m256i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
|
||||
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
|
||||
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
|
||||
_state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ];
|
||||
_state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ];
|
||||
_state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ];
|
||||
_state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ];
|
||||
_state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ];
|
||||
_state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ];
|
||||
_state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ];
|
||||
_state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ];
|
||||
_state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ];
|
||||
_state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ];
|
||||
_state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ];
|
||||
_state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ];
|
||||
_state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ];
|
||||
_state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ];
|
||||
|
||||
for ( b = 0; b < uBlockCount; b++ )
|
||||
{
|
||||
ctx->k = _mm256_add_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
for( j = ctx->uHashSize / 256; j < 4; j++ )
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ j ] = _mm256_load_si256(
|
||||
pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
|
||||
}
|
||||
}
|
||||
|
||||
// save state
|
||||
SAVESTATE_2WAY( _statebackup, _state );
|
||||
|
||||
k1 = ctx->k;
|
||||
|
||||
for ( r = 0; r < ctx->uRounds / 2; r++ )
|
||||
{
|
||||
ECHO_ROUND_UNROLL2_2WAY;
|
||||
}
|
||||
|
||||
if ( ctx->uHashSize == 256 )
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_state[ i ][ 1 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_state[ i ][ 2 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_state[ i ][ 3 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 0 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 1 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 2 ] ) ;
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 3 ] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_state[ i ][ 2 ] );
|
||||
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
|
||||
_state[ i ][ 3 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
|
||||
_statebackup[ i ][ 0 ] );
|
||||
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ] [0 ],
|
||||
_statebackup[ i ][ 2 ] );
|
||||
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
|
||||
_statebackup[ i ][ 1 ] );
|
||||
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
|
||||
_statebackup[ i ][ 3 ] );
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
}
|
||||
SAVESTATE_2WAY(ctx->state, _state);
|
||||
|
||||
}
|
||||
int echo_2way_init( echo_2way_context *ctx, int nHashSize )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
ctx->k = m256_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m256_zero;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_2way_update_close( echo_2way_context *state, void *hashval,
|
||||
const void *data, int databitlen )
|
||||
{
|
||||
// bytelen is either 32 (maybe), 64 or 80 or 128!
|
||||
// all are less than full block.
|
||||
|
||||
int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
const int vblen = state->uBlockLength / 16; // 16 bytes per lane
|
||||
__m256i remainingbits;
|
||||
|
||||
if ( databitlen == 1024 )
|
||||
{
|
||||
echo_2way_compress( state, data, 1 );
|
||||
state->processed_bits = 1024;
|
||||
remainingbits = m256_const2_64( 0, -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy_256( state->buffer, data, vlen );
|
||||
state->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m256_const2_64( 0, databitlen );
|
||||
}
|
||||
|
||||
state->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
|
||||
memset_zero_256( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)state->uHashSize << 48, 0 );
|
||||
state->buffer[ vblen-1 ] = m256_const2_64( 0, state->processed_bits );
|
||||
|
||||
state->k = _mm256_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm256_sub_epi64( state->k, state->const1536 );
|
||||
|
||||
echo_2way_compress( state, state->buffer, 1 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)hashval + 0, state->state[ 0 ][ 0] );
|
||||
_mm256_store_si256( (__m256i*)hashval + 1, state->state[ 1 ][ 0] );
|
||||
|
||||
if ( state->uHashSize == 512 )
|
||||
{
|
||||
_mm256_store_si256( (__m256i*)hashval + 2, state->state[ 2 ][ 0 ] );
|
||||
_mm256_store_si256( (__m256i*)hashval + 3, state->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
const void *data, int datalen )
|
||||
{
|
||||
int i, j;
|
||||
int databitlen = datalen * 8;
|
||||
ctx->k = m256_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x100 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = m256_const2_64( 0, 0x200 );
|
||||
ctx->const1536 = m256_const2_64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m256_zero;
|
||||
|
||||
int vlen = datalen / 32;
|
||||
const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane
|
||||
__m256i remainingbits;
|
||||
|
||||
if ( databitlen == 1024 )
|
||||
{
|
||||
echo_2way_compress( ctx, data, 1 );
|
||||
ctx->processed_bits = 1024;
|
||||
remainingbits = m256_const2_64( 0, -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_256( ctx->buffer, data, vlen );
|
||||
ctx->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = m256_const2_64( 0, databitlen );
|
||||
}
|
||||
|
||||
ctx->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
|
||||
memset_zero_256( ctx->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
ctx->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
|
||||
ctx->buffer[ vblen-1 ] = m256_const2_64( 0, ctx->processed_bits );
|
||||
|
||||
ctx->k = _mm256_add_epi64( ctx->k, remainingbits );
|
||||
ctx->k = _mm256_sub_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
echo_2way_compress( ctx, ctx->buffer, 1 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)hashval + 0, ctx->state[ 0 ][ 0] );
|
||||
_mm256_store_si256( (__m256i*)hashval + 1, ctx->state[ 1 ][ 0] );
|
||||
|
||||
if ( ctx->uHashSize == 512 )
|
||||
{
|
||||
_mm256_store_si256( (__m256i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
|
||||
_mm256_store_si256( (__m256i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif // VAES
|
||||
|
@@ -1,10 +1,12 @@
|
||||
#if !defined(ECHO_HASH_4WAY_H__)
|
||||
#define ECHO_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__VAES__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m512i state[4][4];
|
||||
@@ -20,6 +22,7 @@ typedef struct
|
||||
unsigned int processed_bits;
|
||||
|
||||
} echo_4way_context __attribute__ ((aligned (64)));
|
||||
#define echo512_4way_context echo_4way_context
|
||||
|
||||
int echo_4way_init( echo_4way_context *state, int hashbitlen );
|
||||
#define echo512_4way_init( state ) echo_4way_init( state, 512 )
|
||||
@@ -29,8 +32,8 @@ int echo_4way_update( echo_4way_context *state, const void *data,
|
||||
unsigned int databitlen);
|
||||
#define echo512_4way_update echo_4way_update
|
||||
|
||||
int echo_close( echo_4way_context *state, void *hashval );
|
||||
#define echo512_4way_close echo_4way_close
|
||||
// int echo_4way_close( echo_4way_context *state, void *hashval );
|
||||
// #define echo512_4way_close echo_4way_close
|
||||
|
||||
int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
const void *data, int databitlen );
|
||||
@@ -43,5 +46,45 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
#define echo256_4way_full( state, hashval, data, datalen ) \
|
||||
echo_4way_full( state, hashval, 256, data, datalen )
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif // AVX512
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i state[4][4];
|
||||
__m256i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes
|
||||
__m256i k;
|
||||
__m256i hashsize;
|
||||
__m256i const1536;
|
||||
|
||||
unsigned int uRounds;
|
||||
unsigned int uHashSize;
|
||||
unsigned int uBlockLength;
|
||||
unsigned int uBufferBytes;
|
||||
unsigned int processed_bits;
|
||||
|
||||
} echo_2way_context __attribute__ ((aligned (64)));
|
||||
#define echo512_2way_context echo_2way_context
|
||||
|
||||
int echo_2way_init( echo_2way_context *state, int hashbitlen );
|
||||
#define echo512_2way_init( state ) echo_2way_init( state, 512 )
|
||||
#define echo256_2way_init( state ) echo_2way_init( state, 256 )
|
||||
|
||||
int echo_2way_update( echo_2way_context *state, const void *data,
|
||||
unsigned int databitlen);
|
||||
#define echo512_2way_update echo_2way_update
|
||||
|
||||
int echo_2way_update_close( echo_2way_context *state, void *hashval,
|
||||
const void *data, int databitlen );
|
||||
#define echo512_2way_update_close echo_2way_update_close
|
||||
|
||||
int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
|
||||
const void *data, int datalen );
|
||||
#define echo512_2way_full( state, hashval, data, datalen ) \
|
||||
echo_2way_full( state, hashval, 512, data, datalen )
|
||||
#define echo256_2way_full( state, hashval, data, datalen ) \
|
||||
echo_2way_full( state, hashval, 256, data, datalen )
|
||||
|
||||
|
||||
#endif // VAES
|
||||
|
||||
#endif // ECHO_HASH_4WAY_H__
|
||||
|
@@ -15,7 +15,9 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__AVX2__) && defined(__VAES__)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
|
||||
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
||||
@@ -43,10 +45,10 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
||||
}
|
||||
|
||||
int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
const void* input, uint64_t datalen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = 32 / 16; // bytes to __m128i
|
||||
const int len = (int)datalen >> 4;
|
||||
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
@@ -172,5 +174,161 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // VAES
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2 + VAES
|
||||
|
||||
int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m256_zero;
|
||||
ctx->buffer[i] = m256_zero;
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
||||
const void* input, uint64_t datalen )
|
||||
{
|
||||
const int len = (int)datalen >> 4;
|
||||
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m256i* in = (__m256i*)input;
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m256_zero;
|
||||
ctx->buffer[i] = m256_zero;
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
blocks++; // adjust for final block
|
||||
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512_2way( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF512_2way( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m256i* in = (__m256i*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
blocks++; // adjust for final block
|
||||
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512_2way( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF512_2way( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // VAES
|
||||
|
@@ -18,8 +18,8 @@
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#if defined(__AVX2__) && defined(__VAES__)
|
||||
|
||||
#define LENGTH (256)
|
||||
|
||||
//#include "brg_endian.h"
|
||||
@@ -48,6 +48,8 @@
|
||||
|
||||
#define SIZE256 (SIZE_512/16)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];
|
||||
__attribute__ ((aligned (64))) __m512i buffer[SIZE256];
|
||||
@@ -55,7 +57,7 @@ typedef struct {
|
||||
int blk_count; // SIZE_m128i
|
||||
int buf_ptr; // __m128i offset
|
||||
int rem_ptr;
|
||||
int databitlen; // bits
|
||||
// int databitlen; // bits
|
||||
} groestl256_4way_context;
|
||||
|
||||
|
||||
@@ -74,5 +76,25 @@ int groestl256_4way_update_close( groestl256_4way_context*, void*,
|
||||
int groestl256_4way_full( groestl256_4way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif // AVX512
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (128))) __m256i chaining[SIZE256];
|
||||
__attribute__ ((aligned (64))) __m256i buffer[SIZE256];
|
||||
int hashlen; // byte
|
||||
int blk_count; // SIZE_m128i
|
||||
int buf_ptr; // __m128i offset
|
||||
int rem_ptr;
|
||||
// int databitlen; // bits
|
||||
} groestl256_2way_context;
|
||||
|
||||
int groestl256_2way_init( groestl256_2way_context*, uint64_t );
|
||||
|
||||
int groestl256_2way_update_close( groestl256_2way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
|
||||
int groestl256_2way_full( groestl256_2way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
|
||||
#endif // VAES
|
||||
#endif // GROESTL256_HASH_4WAY_H__
|
||||
|
@@ -12,7 +12,7 @@
|
||||
|
||||
#include "groestl256-hash-4way.h"
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__AVX2__) && defined(__VAES__)
|
||||
|
||||
static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
@@ -42,6 +42,8 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
|
||||
};
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
|
||||
0x1d1519111c141810, 0x1f171b131e161a12,
|
||||
0x2d2529212c242820, 0x2f272b232e262a22,
|
||||
@@ -499,5 +501,398 @@ void OF512_4way( __m512i* chaining )
|
||||
chaining[3] = xmm11;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
static const __m256i TRANSP_MASK_2WAY =
|
||||
{ 0x0d0509010c040800, 0x0f070b030e060a02,
|
||||
0x1d1519111c141810, 0x1f171b131e161a12 };
|
||||
|
||||
static const __m256i SUBSH_MASK0_2WAY =
|
||||
{ 0x0c0f0104070b0e00, 0x03060a0d08020509,
|
||||
0x1c1f1114171b1e10, 0x13161a1d18121519 };
|
||||
|
||||
static const __m256i SUBSH_MASK1_2WAY =
|
||||
{ 0x0e090205000d0801, 0x04070c0f0a03060b,
|
||||
0x1e191215101d1801, 0x14171c1f1a13161b };
|
||||
|
||||
static const __m256i SUBSH_MASK2_2WAY =
|
||||
{ 0x080b0306010f0a02, 0x05000e090c04070d,
|
||||
0x181b1316111f1a12, 0x15101e191c14171d };
|
||||
|
||||
static const __m256i SUBSH_MASK3_2WAY =
|
||||
{ 0x0a0d040702090c03, 0x0601080b0e05000f,
|
||||
0x1a1d141712191c13, 0x1611181b1e15101f };
|
||||
|
||||
static const __m256i SUBSH_MASK4_2WAY =
|
||||
{ 0x0b0e0500030a0d04, 0x0702090c0f060108,
|
||||
0x1b1e1510131a1d14, 0x1712191c1f161118 };
|
||||
|
||||
static const __m256i SUBSH_MASK5_2WAY =
|
||||
{ 0x0d080601040c0f05, 0x00030b0e0907020a,
|
||||
0x1d181611141c1f15, 0x10131b1e1917121a };
|
||||
|
||||
static const __m256i SUBSH_MASK6_2WAY =
|
||||
{ 0x0f0a0702050e0906, 0x01040d080b00030c,
|
||||
0x1f1a1712151e1916, 0x11141d181b10131c };
|
||||
|
||||
static const __m256i SUBSH_MASK7_2WAY =
|
||||
{ 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
0x191c101316181b17, 0x12151f1a1d11141e, };
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2_2WAY(i, j, k){\
|
||||
j = _mm256_xor_si256(j, j);\
|
||||
j = _mm256_cmpgt_epi8(j, i );\
|
||||
i = _mm256_add_epi8(i, i);\
|
||||
j = _mm256_and_si256(j, k);\
|
||||
i = _mm256_xor_si256(i, j);\
|
||||
}
|
||||
|
||||
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm256_xor_si256(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm256_xor_si256(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm256_xor_si256(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm256_xor_si256(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm256_xor_si256(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm256_xor_si256(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm256_xor_si256(a6, a7);\
|
||||
a7 = _mm256_xor_si256(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm256_xor_si256(b0, a4);\
|
||||
b6 = _mm256_xor_si256(b6, a4);\
|
||||
b1 = _mm256_xor_si256(b1, a5);\
|
||||
b7 = _mm256_xor_si256(b7, a5);\
|
||||
b2 = _mm256_xor_si256(b2, a6);\
|
||||
b0 = _mm256_xor_si256(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm256_xor_si256(b3, a7);\
|
||||
b1 = _mm256_xor_si256(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm256_xor_si256(b4, a0);\
|
||||
b2 = _mm256_xor_si256(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm256_xor_si256(b5, a1);\
|
||||
b3 = _mm256_xor_si256(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm256_xor_si256(b6, a2);\
|
||||
b4 = _mm256_xor_si256(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm256_xor_si256(b7, a3);\
|
||||
b5 = _mm256_xor_si256(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm256_xor_si256(a0, a3);\
|
||||
a1 = _mm256_xor_si256(a1, a4);\
|
||||
a2 = _mm256_xor_si256(a2, a5);\
|
||||
a3 = _mm256_xor_si256(a3, a6);\
|
||||
a4 = _mm256_xor_si256(a4, a7);\
|
||||
a5 = _mm256_xor_si256(a5, b0);\
|
||||
a6 = _mm256_xor_si256(a6, b1);\
|
||||
a7 = _mm256_xor_si256(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2_2WAY(a0, b0, b1);\
|
||||
a0 = _mm256_xor_si256(a0, TEMP0);\
|
||||
MUL2_2WAY(a1, b0, b1);\
|
||||
a1 = _mm256_xor_si256(a1, TEMP1);\
|
||||
MUL2_2WAY(a2, b0, b1);\
|
||||
a2 = _mm256_xor_si256(a2, b2);\
|
||||
MUL2_2WAY(a3, b0, b1);\
|
||||
a3 = _mm256_xor_si256(a3, b3);\
|
||||
MUL2_2WAY(a4, b0, b1);\
|
||||
a4 = _mm256_xor_si256(a4, b4);\
|
||||
MUL2_2WAY(a5, b0, b1);\
|
||||
a5 = _mm256_xor_si256(a5, b5);\
|
||||
MUL2_2WAY(a6, b0, b1);\
|
||||
a6 = _mm256_xor_si256(a6, b6);\
|
||||
MUL2_2WAY(a7, b0, b1);\
|
||||
a7 = _mm256_xor_si256(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2_2WAY(a0, b0, b1);\
|
||||
b5 = _mm256_xor_si256(b5, a0);\
|
||||
MUL2_2WAY(a1, b0, b1);\
|
||||
b6 = _mm256_xor_si256(b6, a1);\
|
||||
MUL2_2WAY(a2, b0, b1);\
|
||||
b7 = _mm256_xor_si256(b7, a2);\
|
||||
MUL2_2WAY(a5, b0, b1);\
|
||||
b2 = _mm256_xor_si256(b2, a5);\
|
||||
MUL2_2WAY(a6, b0, b1);\
|
||||
b3 = _mm256_xor_si256(b3, a6);\
|
||||
MUL2_2WAY(a7, b0, b1);\
|
||||
b4 = _mm256_xor_si256(b4, a7);\
|
||||
MUL2_2WAY(a3, b0, b1);\
|
||||
MUL2_2WAY(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm256_xor_si256(b0, a3);\
|
||||
b1 = _mm256_xor_si256(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = m256_const2_64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm256_xor_si256( a0, m256_const1_128( round_const_l0[i] ) );\
|
||||
a1 = _mm256_xor_si256( a1, b1 );\
|
||||
a2 = _mm256_xor_si256( a2, b1 );\
|
||||
a3 = _mm256_xor_si256( a3, b1 );\
|
||||
a4 = _mm256_xor_si256( a4, b1 );\
|
||||
a5 = _mm256_xor_si256( a5, b1 );\
|
||||
a6 = _mm256_xor_si256( a6, b1 );\
|
||||
a7 = _mm256_xor_si256( a7, m256_const1_128( round_const_l7[i] ) );\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm256_xor_si256( b0, b0 );\
|
||||
a0 = _mm256_shuffle_epi8( a0, SUBSH_MASK0_2WAY );\
|
||||
a0 = _mm256_aesenclast_epi128(a0, b0 );\
|
||||
a1 = _mm256_shuffle_epi8( a1, SUBSH_MASK1_2WAY );\
|
||||
a1 = _mm256_aesenclast_epi128(a1, b0 );\
|
||||
a2 = _mm256_shuffle_epi8( a2, SUBSH_MASK2_2WAY );\
|
||||
a2 = _mm256_aesenclast_epi128(a2, b0 );\
|
||||
a3 = _mm256_shuffle_epi8( a3, SUBSH_MASK3_2WAY );\
|
||||
a3 = _mm256_aesenclast_epi128(a3, b0 );\
|
||||
a4 = _mm256_shuffle_epi8( a4, SUBSH_MASK4_2WAY );\
|
||||
a4 = _mm256_aesenclast_epi128(a4, b0 );\
|
||||
a5 = _mm256_shuffle_epi8( a5, SUBSH_MASK5_2WAY );\
|
||||
a5 = _mm256_aesenclast_epi128(a5, b0 );\
|
||||
a6 = _mm256_shuffle_epi8( a6, SUBSH_MASK6_2WAY );\
|
||||
a6 = _mm256_aesenclast_epi128(a6, b0 );\
|
||||
a7 = _mm256_shuffle_epi8( a7, SUBSH_MASK7_2WAY );\
|
||||
a7 = _mm256_aesenclast_epi128( a7, b0 );\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q_2WAY(){\
|
||||
ROUND_2WAY(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND_2WAY(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND_2WAY(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND_2WAY(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND_2WAY(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND_2WAY(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND_2WAY(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND_2WAY(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND_2WAY(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND_2WAY(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}
|
||||
|
||||
#define Matrix_Transpose_A_2way(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK_2WAY;\
|
||||
\
|
||||
i0 = _mm256_shuffle_epi8( i0, t0 );\
|
||||
i1 = _mm256_shuffle_epi8( i1, t0 );\
|
||||
i2 = _mm256_shuffle_epi8( i2, t0 );\
|
||||
i3 = _mm256_shuffle_epi8( i3, t0 );\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm256_unpacklo_epi16( i0, i1 );\
|
||||
o1 = _mm256_unpackhi_epi16( o1, i1 );\
|
||||
i2 = _mm256_unpacklo_epi16( i2, i3 );\
|
||||
t0 = _mm256_unpackhi_epi16( t0, i3 );\
|
||||
\
|
||||
i0 = _mm256_shuffle_epi32( i0, 216 );\
|
||||
o1 = _mm256_shuffle_epi32( o1, 216 );\
|
||||
i2 = _mm256_shuffle_epi32( i2, 216 );\
|
||||
t0 = _mm256_shuffle_epi32( t0, 216 );\
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm256_unpacklo_epi32( i0, i2 );\
|
||||
o1 = _mm256_unpacklo_epi32( o1, t0 );\
|
||||
o2 = _mm256_unpackhi_epi32( o2, i2 );\
|
||||
o3 = _mm256_unpackhi_epi32( o3, t0 );\
|
||||
}/**/
|
||||
|
||||
#define Matrix_Transpose_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm256_unpacklo_epi64( i0, i4 );\
|
||||
o1 = _mm256_unpackhi_epi64( o1, i4 );\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm256_unpacklo_epi64( o2, i5 );\
|
||||
o3 = _mm256_unpackhi_epi64( o3, i5 );\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm256_unpacklo_epi64( o4, i6 );\
|
||||
o5 = _mm256_unpackhi_epi64( o5, i6 );\
|
||||
o7 = i3;\
|
||||
o6 = _mm256_unpacklo_epi64( o6, i7 );\
|
||||
o7 = _mm256_unpackhi_epi64( o7, i7 );\
|
||||
}/**/
|
||||
|
||||
#define Matrix_Transpose_B_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm256_unpacklo_epi64( i0, i1 );\
|
||||
o0 = _mm256_unpackhi_epi64( o0, i1 );\
|
||||
o1 = i2;\
|
||||
i2 = _mm256_unpacklo_epi64( i2, i3 );\
|
||||
o1 = _mm256_unpackhi_epi64( o1, i3 );\
|
||||
o2 = i4;\
|
||||
i4 = _mm256_unpacklo_epi64( i4, i5 );\
|
||||
o2 = _mm256_unpackhi_epi64( o2, i5 );\
|
||||
o3 = i6;\
|
||||
i6 = _mm256_unpacklo_epi64( i6, i7 );\
|
||||
o3 = _mm256_unpackhi_epi64( o3, i7 );\
|
||||
}/**/
|
||||
|
||||
#define Matrix_Transpose_O_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm256_xor_si256( t0, t0 );\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm256_unpacklo_epi64( i0, t0 );\
|
||||
i1 = _mm256_unpackhi_epi64( i1, t0 );\
|
||||
i2 = _mm256_unpacklo_epi64( i2, t0 );\
|
||||
i3 = _mm256_unpackhi_epi64( i3, t0 );\
|
||||
i4 = _mm256_unpacklo_epi64( i4, t0 );\
|
||||
i5 = _mm256_unpackhi_epi64( i5, t0 );\
|
||||
i6 = _mm256_unpacklo_epi64( i6, t0 );\
|
||||
i7 = _mm256_unpackhi_epi64( i7, t0 );\
|
||||
}/**/
|
||||
|
||||
#define Matrix_Transpose_O_B_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm256_unpacklo_epi64( i0, i1 );\
|
||||
i2 = _mm256_unpacklo_epi64( i2, i3 );\
|
||||
i4 = _mm256_unpacklo_epi64( i4, i5 );\
|
||||
i6 = _mm256_unpacklo_epi64( i6, i7 );\
|
||||
}/**/
|
||||
|
||||
void TF512_2way( __m256i* chaining, __m256i* message )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A_2way(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
xmm8 = chaining[0];
|
||||
xmm0 = chaining[1];
|
||||
xmm4 = chaining[2];
|
||||
xmm5 = chaining[3];
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm256_xor_si256( xmm8, xmm12 );
|
||||
xmm0 = _mm256_xor_si256( xmm0, xmm2 );
|
||||
xmm4 = _mm256_xor_si256( xmm4, xmm6 );
|
||||
xmm5 = _mm256_xor_si256( xmm5, xmm7 );
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B_2way(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q_2WAY();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm256_xor_si256( xmm0, xmm8 );
|
||||
xmm1 = _mm256_xor_si256( xmm1, xmm10 );
|
||||
xmm2 = _mm256_xor_si256( xmm2, xmm12 );
|
||||
xmm3 = _mm256_xor_si256( xmm3, xmm14 );
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm256_xor_si256( xmm0, (chaining[0]) );
|
||||
xmm1 = _mm256_xor_si256( xmm1, (chaining[1]) );
|
||||
xmm2 = _mm256_xor_si256( xmm2, (chaining[2]) );
|
||||
xmm3 = _mm256_xor_si256( xmm3, (chaining[3]) );
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512_2way( __m256i* chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q_2WAY();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
|
||||
xmm10 = _mm256_xor_si256( xmm10, (chaining[1]) );
|
||||
xmm12 = _mm256_xor_si256( xmm12, (chaining[2]) );
|
||||
xmm14 = _mm256_xor_si256( xmm14, (chaining[3]) );
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A_2way(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
}
|
||||
|
||||
#endif // VAES
|
||||
#endif // GROESTL512_INTR_4WAY_H__
|
||||
#endif // GROESTL256_INTR_4WAY_H__
|
||||
|
@@ -15,7 +15,9 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__AVX2__) && defined(__VAES__)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
@@ -137,5 +139,130 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
// AVX2 + VAES
|
||||
|
||||
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
memset_zero_256( ctx->chaining, SIZE512 );
|
||||
memset_zero_256( ctx->buffer, SIZE512 );
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = 64 / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE512;
|
||||
__m256i* in = (__m256i*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] );
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem;
|
||||
|
||||
//--- final ---
|
||||
|
||||
blocks++; // adjust for final block
|
||||
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
}
|
||||
|
||||
TF1024_2way( ctx->chaining, ctx->buffer );
|
||||
OF1024_2way( ctx->chaining );
|
||||
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
|
||||
const void* input, uint64_t datalen )
|
||||
{
|
||||
const int len = (int)datalen >> 4;
|
||||
const int hashlen_m128i = 64 >> 4; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m256i* in = (__m256i*)input;
|
||||
int i;
|
||||
|
||||
// --- init ---
|
||||
|
||||
memset_zero_256( ctx->chaining, SIZE512 );
|
||||
memset_zero_256( ctx->buffer, SIZE512 );
|
||||
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] );
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += ctx->rem_ptr;
|
||||
|
||||
// --- close ---
|
||||
|
||||
blocks++;
|
||||
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m256_zero;
|
||||
ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
TF1024_2way( ctx->chaining, ctx->buffer );
|
||||
OF1024_2way( ctx->chaining );
|
||||
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // VAES
|
||||
|
||||
|
@@ -10,7 +10,7 @@
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__AVX2__) && defined(__VAES__)
|
||||
|
||||
#define LENGTH (512)
|
||||
|
||||
@@ -36,20 +36,19 @@
|
||||
|
||||
#define SIZE512 (SIZE_1024/16)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];
|
||||
__attribute__ ((aligned (64))) __m512i buffer[SIZE512];
|
||||
int blk_count; // SIZE_m128i
|
||||
int buf_ptr; // __m128i offset
|
||||
int rem_ptr;
|
||||
int databitlen; // bits
|
||||
} groestl512_4way_context;
|
||||
|
||||
|
||||
int groestl512_4way_init( groestl512_4way_context*, uint64_t );
|
||||
|
||||
//int reinit_groestl( hashState_groestl* );
|
||||
|
||||
int groestl512_4way_update( groestl512_4way_context*, const void*,
|
||||
uint64_t );
|
||||
int groestl512_4way_close( groestl512_4way_context*, void* );
|
||||
@@ -58,5 +57,29 @@ int groestl512_4way_update_close( groestl512_4way_context*, void*,
|
||||
int groestl512_4way_full( groestl512_4way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2 + VAES
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (128))) __m256i chaining[SIZE512];
|
||||
__attribute__ ((aligned (64))) __m256i buffer[SIZE512];
|
||||
int blk_count; // SIZE_m128i
|
||||
int buf_ptr; // __m128i offset
|
||||
int rem_ptr;
|
||||
} groestl512_2way_context;
|
||||
|
||||
|
||||
int groestl512_2way_init( groestl512_2way_context*, uint64_t );
|
||||
|
||||
int groestl512_2way_update( groestl512_2way_context*, const void*,
|
||||
uint64_t );
|
||||
int groestl512_2way_close( groestl512_2way_context*, void* );
|
||||
int groestl512_2way_update_close( groestl512_2way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
int groestl512_2way_full( groestl512_2way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
|
||||
|
||||
#endif // VAES
|
||||
#endif // GROESTL512_HASH_4WAY_H__
|
||||
|
@@ -12,7 +12,7 @@
|
||||
|
||||
#include "groestl512-hash-4way.h"
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__AVX2__) && defined(__VAES__)
|
||||
|
||||
static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
@@ -50,6 +50,8 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
||||
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
|
||||
};
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
|
||||
0x1d1519111c141810, 0x1f171b131e161a12,
|
||||
0x2d2529212c242820, 0x2f272b232e262a22,
|
||||
@@ -660,5 +662,578 @@ void OF1024_4way( __m512i* chaining )
|
||||
return;
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2 + VAES
|
||||
|
||||
static const __m256i TRANSP_MASK_2WAY =
|
||||
{ 0x0d0509010c040800, 0x0f070b030e060a02,
|
||||
0x1d1519111c141810, 0x1f171b131e161a12 };
|
||||
|
||||
static const __m256i SUBSH_MASK0_2WAY =
|
||||
{ 0x0b0e0104070a0d00, 0x0306090c0f020508,
|
||||
0x1b1e1114171a1d10, 0x1316191c1f121518 };
|
||||
|
||||
static const __m256i SUBSH_MASK1_2WAY =
|
||||
{ 0x0c0f0205080b0e01, 0x04070a0d00030609,
|
||||
0x1c1f1215181b1e11, 0x14171a1d10131619 };
|
||||
|
||||
static const __m256i SUBSH_MASK2_2WAY =
|
||||
{ 0x0d000306090c0f02, 0x05080b0e0104070a,
|
||||
0x1d101316191c1f12, 0x15181b1e1114171a };
|
||||
|
||||
static const __m256i SUBSH_MASK3_2WAY =
|
||||
{ 0x0e0104070a0d0003, 0x06090c0f0205080b,
|
||||
0x1e1114171a1d1013, 0x16191c1f1215181b };
|
||||
|
||||
static const __m256i SUBSH_MASK4_2WAY =
|
||||
{ 0x0f0205080b0e0104, 0x070a0d000306090c,
|
||||
0x1f1215181b1e1114, 0x171a1d101316191c };
|
||||
|
||||
static const __m256i SUBSH_MASK5_2WAY =
|
||||
{ 0x000306090c0f0205, 0x080b0e0104070a0d,
|
||||
0x101316191c1f1215, 0x181b1e1114171a1d };
|
||||
|
||||
static const __m256i SUBSH_MASK6_2WAY =
|
||||
{ 0x0104070a0d000306, 0x090c0f0205080b0e,
|
||||
0x1114171a1d101316, 0x191c1f1215181b1e };
|
||||
|
||||
static const __m256i SUBSH_MASK7_2WAY =
|
||||
{ 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
0x16191c1f1215181b, 0x1e1114171a1d1013 };
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2_2WAY(i, j, k){\
|
||||
j = _mm256_xor_si256(j, j);\
|
||||
j = _mm256_cmpgt_epi8(j, i );\
|
||||
i = _mm256_add_epi8(i, i);\
|
||||
j = _mm256_and_si256(j, k);\
|
||||
i = _mm256_xor_si256(i, j);\
|
||||
}
|
||||
|
||||
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm256_xor_si256(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm256_xor_si256(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm256_xor_si256(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm256_xor_si256(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm256_xor_si256(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm256_xor_si256(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm256_xor_si256(a6, a7);\
|
||||
a7 = _mm256_xor_si256(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm256_xor_si256(b0, a4);\
|
||||
b6 = _mm256_xor_si256(b6, a4);\
|
||||
b1 = _mm256_xor_si256(b1, a5);\
|
||||
b7 = _mm256_xor_si256(b7, a5);\
|
||||
b2 = _mm256_xor_si256(b2, a6);\
|
||||
b0 = _mm256_xor_si256(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm256_xor_si256(b3, a7);\
|
||||
b1 = _mm256_xor_si256(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm256_xor_si256(b4, a0);\
|
||||
b2 = _mm256_xor_si256(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm256_xor_si256(b5, a1);\
|
||||
b3 = _mm256_xor_si256(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm256_xor_si256(b6, a2);\
|
||||
b4 = _mm256_xor_si256(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm256_xor_si256(b7, a3);\
|
||||
b5 = _mm256_xor_si256(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm256_xor_si256(a0, a3);\
|
||||
a1 = _mm256_xor_si256(a1, a4);\
|
||||
a2 = _mm256_xor_si256(a2, a5);\
|
||||
a3 = _mm256_xor_si256(a3, a6);\
|
||||
a4 = _mm256_xor_si256(a4, a7);\
|
||||
a5 = _mm256_xor_si256(a5, b0);\
|
||||
a6 = _mm256_xor_si256(a6, b1);\
|
||||
a7 = _mm256_xor_si256(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2_2WAY(a0, b0, b1);\
|
||||
a0 = _mm256_xor_si256(a0, TEMP0);\
|
||||
MUL2_2WAY(a1, b0, b1);\
|
||||
a1 = _mm256_xor_si256(a1, TEMP1);\
|
||||
MUL2_2WAY(a2, b0, b1);\
|
||||
a2 = _mm256_xor_si256(a2, b2);\
|
||||
MUL2_2WAY(a3, b0, b1);\
|
||||
a3 = _mm256_xor_si256(a3, b3);\
|
||||
MUL2_2WAY(a4, b0, b1);\
|
||||
a4 = _mm256_xor_si256(a4, b4);\
|
||||
MUL2_2WAY(a5, b0, b1);\
|
||||
a5 = _mm256_xor_si256(a5, b5);\
|
||||
MUL2_2WAY(a6, b0, b1);\
|
||||
a6 = _mm256_xor_si256(a6, b6);\
|
||||
MUL2_2WAY(a7, b0, b1);\
|
||||
a7 = _mm256_xor_si256(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2_2WAY(a0, b0, b1);\
|
||||
b5 = _mm256_xor_si256(b5, a0);\
|
||||
MUL2_2WAY(a1, b0, b1);\
|
||||
b6 = _mm256_xor_si256(b6, a1);\
|
||||
MUL2_2WAY(a2, b0, b1);\
|
||||
b7 = _mm256_xor_si256(b7, a2);\
|
||||
MUL2_2WAY(a5, b0, b1);\
|
||||
b2 = _mm256_xor_si256(b2, a5);\
|
||||
MUL2_2WAY(a6, b0, b1);\
|
||||
b3 = _mm256_xor_si256(b3, a6);\
|
||||
MUL2_2WAY(a7, b0, b1);\
|
||||
b4 = _mm256_xor_si256(b4, a7);\
|
||||
MUL2_2WAY(a3, b0, b1);\
|
||||
MUL2_2WAY(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm256_xor_si256(b0, a3);\
|
||||
b1 = _mm256_xor_si256(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define SUBMIX_2WAY(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* SubBytes */\
|
||||
b0 = _mm256_xor_si256( b0, b0 );\
|
||||
a0 = _mm256_aesenclast_epi128( a0, b0 );\
|
||||
a1 = _mm256_aesenclast_epi128( a1, b0 );\
|
||||
a2 = _mm256_aesenclast_epi128( a2, b0 );\
|
||||
a3 = _mm256_aesenclast_epi128( a3, b0 );\
|
||||
a4 = _mm256_aesenclast_epi128( a4, b0 );\
|
||||
a5 = _mm256_aesenclast_epi128( a5, b0 );\
|
||||
a6 = _mm256_aesenclast_epi128( a6, b0 );\
|
||||
a7 = _mm256_aesenclast_epi128( a7, b0 );\
|
||||
/* MixBytes */\
|
||||
MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
#define ROUNDS_P_2WAY(){\
|
||||
uint8_t round_counter = 0;\
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
|
||||
{ \
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm256_xor_si256( xmm8, m256_const1_128( \
|
||||
casti_m128i( round_const_p, round_counter ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \
|
||||
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\
|
||||
xmm10 = _mm256_shuffle_epi8( xmm10, SUBSH_MASK2_2WAY );\
|
||||
xmm11 = _mm256_shuffle_epi8( xmm11, SUBSH_MASK3_2WAY );\
|
||||
xmm12 = _mm256_shuffle_epi8( xmm12, SUBSH_MASK4_2WAY );\
|
||||
xmm13 = _mm256_shuffle_epi8( xmm13, SUBSH_MASK5_2WAY );\
|
||||
xmm14 = _mm256_shuffle_epi8( xmm14, SUBSH_MASK6_2WAY );\
|
||||
xmm15 = _mm256_shuffle_epi8( xmm15, SUBSH_MASK7_2WAY );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm256_xor_si256( xmm0, m256_const1_128( \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
|
||||
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
|
||||
xmm2 = _mm256_shuffle_epi8( xmm2, SUBSH_MASK2_2WAY );\
|
||||
xmm3 = _mm256_shuffle_epi8( xmm3, SUBSH_MASK3_2WAY );\
|
||||
xmm4 = _mm256_shuffle_epi8( xmm4, SUBSH_MASK4_2WAY );\
|
||||
xmm5 = _mm256_shuffle_epi8( xmm5, SUBSH_MASK5_2WAY );\
|
||||
xmm6 = _mm256_shuffle_epi8( xmm6, SUBSH_MASK6_2WAY );\
|
||||
xmm7 = _mm256_shuffle_epi8( xmm7, SUBSH_MASK7_2WAY );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX_2WAY(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
#define ROUNDS_Q_2WAY(){\
|
||||
uint8_t round_counter = 0;\
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2) \
|
||||
{ \
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = m256_neg1;\
|
||||
xmm8 = _mm256_xor_si256( xmm8, xmm1 );\
|
||||
xmm9 = _mm256_xor_si256( xmm9, xmm1 );\
|
||||
xmm10 = _mm256_xor_si256( xmm10, xmm1 );\
|
||||
xmm11 = _mm256_xor_si256( xmm11, xmm1 );\
|
||||
xmm12 = _mm256_xor_si256( xmm12, xmm1 );\
|
||||
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
|
||||
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
|
||||
xmm15 = _mm256_xor_si256( xmm15, m256_const1_128( \
|
||||
casti_m128i( round_const_q, round_counter ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
|
||||
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
|
||||
xmm10 = _mm256_shuffle_epi8( xmm10, SUBSH_MASK5_2WAY );\
|
||||
xmm11 = _mm256_shuffle_epi8( xmm11, SUBSH_MASK7_2WAY );\
|
||||
xmm12 = _mm256_shuffle_epi8( xmm12, SUBSH_MASK0_2WAY );\
|
||||
xmm13 = _mm256_shuffle_epi8( xmm13, SUBSH_MASK2_2WAY );\
|
||||
xmm14 = _mm256_shuffle_epi8( xmm14, SUBSH_MASK4_2WAY );\
|
||||
xmm15 = _mm256_shuffle_epi8( xmm15, SUBSH_MASK6_2WAY );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = m256_neg1;\
|
||||
xmm0 = _mm256_xor_si256( xmm0, xmm9 );\
|
||||
xmm1 = _mm256_xor_si256( xmm1, xmm9 );\
|
||||
xmm2 = _mm256_xor_si256( xmm2, xmm9 );\
|
||||
xmm3 = _mm256_xor_si256( xmm3, xmm9 );\
|
||||
xmm4 = _mm256_xor_si256( xmm4, xmm9 );\
|
||||
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
|
||||
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
|
||||
xmm7 = _mm256_xor_si256( xmm7, m256_const1_128( \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
|
||||
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
|
||||
xmm2 = _mm256_shuffle_epi8( xmm2, SUBSH_MASK5_2WAY );\
|
||||
xmm3 = _mm256_shuffle_epi8( xmm3, SUBSH_MASK7_2WAY );\
|
||||
xmm4 = _mm256_shuffle_epi8( xmm4, SUBSH_MASK0_2WAY );\
|
||||
xmm5 = _mm256_shuffle_epi8( xmm5, SUBSH_MASK2_2WAY );\
|
||||
xmm6 = _mm256_shuffle_epi8( xmm6, SUBSH_MASK4_2WAY );\
|
||||
xmm7 = _mm256_shuffle_epi8( xmm7, SUBSH_MASK6_2WAY );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX_2WAY(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
#define Matrix_Transpose_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
|
||||
t0 = TRANSP_MASK_2WAY;\
|
||||
\
|
||||
i6 = _mm256_shuffle_epi8(i6, t0);\
|
||||
i0 = _mm256_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm256_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm256_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm256_shuffle_epi8(i3, t0);\
|
||||
t1 = i2;\
|
||||
i4 = _mm256_shuffle_epi8(i4, t0);\
|
||||
i5 = _mm256_shuffle_epi8(i5, t0);\
|
||||
t2 = i4;\
|
||||
t3 = i6;\
|
||||
i7 = _mm256_shuffle_epi8(i7, t0);\
|
||||
\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t0 = i0;\
|
||||
t2 = _mm256_unpackhi_epi16(t2, i5);\
|
||||
i4 = _mm256_unpacklo_epi16(i4, i5);\
|
||||
t3 = _mm256_unpackhi_epi16(t3, i7);\
|
||||
i6 = _mm256_unpacklo_epi16(i6, i7);\
|
||||
t0 = _mm256_unpackhi_epi16(t0, i1);\
|
||||
t1 = _mm256_unpackhi_epi16(t1, i3);\
|
||||
i2 = _mm256_unpacklo_epi16(i2, i3);\
|
||||
i0 = _mm256_unpacklo_epi16(i0, i1);\
|
||||
\
|
||||
/* shuffle with immediate */\
|
||||
t0 = _mm256_shuffle_epi32(t0, 216);\
|
||||
t1 = _mm256_shuffle_epi32(t1, 216);\
|
||||
t2 = _mm256_shuffle_epi32(t2, 216);\
|
||||
t3 = _mm256_shuffle_epi32(t3, 216);\
|
||||
i0 = _mm256_shuffle_epi32(i0, 216);\
|
||||
i2 = _mm256_shuffle_epi32(i2, 216);\
|
||||
i4 = _mm256_shuffle_epi32(i4, 216);\
|
||||
i6 = _mm256_shuffle_epi32(i6, 216);\
|
||||
\
|
||||
/* continue with unpack */\
|
||||
t4 = i0;\
|
||||
i0 = _mm256_unpacklo_epi32(i0, i2);\
|
||||
t4 = _mm256_unpackhi_epi32(t4, i2);\
|
||||
t5 = t0;\
|
||||
t0 = _mm256_unpacklo_epi32(t0, t1);\
|
||||
t5 = _mm256_unpackhi_epi32(t5, t1);\
|
||||
t6 = i4;\
|
||||
i4 = _mm256_unpacklo_epi32(i4, i6);\
|
||||
t7 = t2;\
|
||||
t6 = _mm256_unpackhi_epi32(t6, i6);\
|
||||
i2 = t0;\
|
||||
t2 = _mm256_unpacklo_epi32(t2, t3);\
|
||||
i3 = t0;\
|
||||
t7 = _mm256_unpackhi_epi32(t7, t3);\
|
||||
\
|
||||
/* there are now 2 rows in each xmm */\
|
||||
/* unpack to get 1 row of CV in each xmm */\
|
||||
i1 = i0;\
|
||||
i1 = _mm256_unpackhi_epi64(i1, i4);\
|
||||
i0 = _mm256_unpacklo_epi64(i0, i4);\
|
||||
i4 = t4;\
|
||||
i3 = _mm256_unpackhi_epi64(i3, t2);\
|
||||
i5 = t4;\
|
||||
i2 = _mm256_unpacklo_epi64(i2, t2);\
|
||||
i6 = t5;\
|
||||
i5 = _mm256_unpackhi_epi64(i5, t6);\
|
||||
i7 = t5;\
|
||||
i4 = _mm256_unpacklo_epi64(i4, t6);\
|
||||
i7 = _mm256_unpackhi_epi64(i7, t7);\
|
||||
i6 = _mm256_unpacklo_epi64(i6, t7);\
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
#define Matrix_Transpose_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
|
||||
/* transpose matrix to get output format */\
|
||||
o1 = i0;\
|
||||
i0 = _mm256_unpacklo_epi64(i0, i1);\
|
||||
o1 = _mm256_unpackhi_epi64(o1, i1);\
|
||||
t0 = i2;\
|
||||
i2 = _mm256_unpacklo_epi64(i2, i3);\
|
||||
t0 = _mm256_unpackhi_epi64(t0, i3);\
|
||||
t1 = i4;\
|
||||
i4 = _mm256_unpacklo_epi64(i4, i5);\
|
||||
t1 = _mm256_unpackhi_epi64(t1, i5);\
|
||||
t2 = i6;\
|
||||
o0 = TRANSP_MASK_2WAY;\
|
||||
i6 = _mm256_unpacklo_epi64(i6, i7);\
|
||||
t2 = _mm256_unpackhi_epi64(t2, i7);\
|
||||
/* load transpose mask into a register, because it will be used 8 times */\
|
||||
i0 = _mm256_shuffle_epi8(i0, o0);\
|
||||
i2 = _mm256_shuffle_epi8(i2, o0);\
|
||||
i4 = _mm256_shuffle_epi8(i4, o0);\
|
||||
i6 = _mm256_shuffle_epi8(i6, o0);\
|
||||
o1 = _mm256_shuffle_epi8(o1, o0);\
|
||||
t0 = _mm256_shuffle_epi8(t0, o0);\
|
||||
t1 = _mm256_shuffle_epi8(t1, o0);\
|
||||
t2 = _mm256_shuffle_epi8(t2, o0);\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t3 = i4;\
|
||||
o2 = o1;\
|
||||
o0 = i0;\
|
||||
t4 = t1;\
|
||||
\
|
||||
t3 = _mm256_unpackhi_epi16(t3, i6);\
|
||||
i4 = _mm256_unpacklo_epi16(i4, i6);\
|
||||
o0 = _mm256_unpackhi_epi16(o0, i2);\
|
||||
i0 = _mm256_unpacklo_epi16(i0, i2);\
|
||||
o2 = _mm256_unpackhi_epi16(o2, t0);\
|
||||
o1 = _mm256_unpacklo_epi16(o1, t0);\
|
||||
t4 = _mm256_unpackhi_epi16(t4, t2);\
|
||||
t1 = _mm256_unpacklo_epi16(t1, t2);\
|
||||
/* shuffle with immediate */\
|
||||
i4 = _mm256_shuffle_epi32(i4, 216);\
|
||||
t3 = _mm256_shuffle_epi32(t3, 216);\
|
||||
o1 = _mm256_shuffle_epi32(o1, 216);\
|
||||
o2 = _mm256_shuffle_epi32(o2, 216);\
|
||||
i0 = _mm256_shuffle_epi32(i0, 216);\
|
||||
o0 = _mm256_shuffle_epi32(o0, 216);\
|
||||
t1 = _mm256_shuffle_epi32(t1, 216);\
|
||||
t4 = _mm256_shuffle_epi32(t4, 216);\
|
||||
/* continue with unpack */\
|
||||
i1 = i0;\
|
||||
i3 = o0;\
|
||||
i5 = o1;\
|
||||
i7 = o2;\
|
||||
i0 = _mm256_unpacklo_epi32(i0, i4);\
|
||||
i1 = _mm256_unpackhi_epi32(i1, i4);\
|
||||
o0 = _mm256_unpacklo_epi32(o0, t3);\
|
||||
i3 = _mm256_unpackhi_epi32(i3, t3);\
|
||||
o1 = _mm256_unpacklo_epi32(o1, t1);\
|
||||
i5 = _mm256_unpackhi_epi32(i5, t1);\
|
||||
o2 = _mm256_unpacklo_epi32(o2, t4);\
|
||||
i7 = _mm256_unpackhi_epi32(i7, t4);\
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
void INIT_2way( __m256i *chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
xmm9 = chaining[1];
|
||||
xmm10 = chaining[2];
|
||||
xmm11 = chaining[3];
|
||||
xmm12 = chaining[4];
|
||||
xmm13 = chaining[5];
|
||||
xmm14 = chaining[6];
|
||||
xmm15 = chaining[7];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
Matrix_Transpose_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
}
|
||||
|
||||
void TF1024_2way( __m256i *chaining, const __m256i *message )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i QTEMP[8];
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
|
||||
/* load message into registers xmm8 - xmm15 (Q = message) */
|
||||
xmm8 = message[0];
|
||||
xmm9 = message[1];
|
||||
xmm10 = message[2];
|
||||
xmm11 = message[3];
|
||||
xmm12 = message[4];
|
||||
xmm13 = message[5];
|
||||
xmm14 = message[6];
|
||||
xmm15 = message[7];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
Matrix_Transpose_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
|
||||
/* store message M (Q input) for later */
|
||||
QTEMP[0] = xmm8;
|
||||
QTEMP[1] = xmm9;
|
||||
QTEMP[2] = xmm10;
|
||||
QTEMP[3] = xmm11;
|
||||
QTEMP[4] = xmm12;
|
||||
QTEMP[5] = xmm13;
|
||||
QTEMP[6] = xmm14;
|
||||
QTEMP[7] = xmm15;
|
||||
|
||||
/* xor CV to message to get P input */
|
||||
/* result: CV+M in xmm8...xmm15 */
|
||||
xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
|
||||
xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) );
|
||||
xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) );
|
||||
xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) );
|
||||
xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) );
|
||||
xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) );
|
||||
xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) );
|
||||
xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) );
|
||||
|
||||
/* compute permutation P */
|
||||
/* result: P(CV+M) in xmm8...xmm15 */
|
||||
ROUNDS_P_2WAY();
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV+M)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
|
||||
xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) );
|
||||
xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) );
|
||||
xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) );
|
||||
xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) );
|
||||
xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) );
|
||||
xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) );
|
||||
xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) );
|
||||
|
||||
/* store P(CV+M)+CV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
/* load message M (Q input) into xmm8-15 */
|
||||
xmm8 = QTEMP[0];
|
||||
xmm9 = QTEMP[1];
|
||||
xmm10 = QTEMP[2];
|
||||
xmm11 = QTEMP[3];
|
||||
xmm12 = QTEMP[4];
|
||||
xmm13 = QTEMP[5];
|
||||
xmm14 = QTEMP[6];
|
||||
xmm15 = QTEMP[7];
|
||||
|
||||
/* compute permutation Q */
|
||||
/* result: Q(M) in xmm8...xmm15 */
|
||||
ROUNDS_Q_2WAY();
|
||||
|
||||
/* xor Q output */
|
||||
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
|
||||
xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
|
||||
xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) );
|
||||
xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) );
|
||||
xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) );
|
||||
xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) );
|
||||
xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) );
|
||||
xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) );
|
||||
xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) );
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm8;
|
||||
chaining[1] = xmm9;
|
||||
chaining[2] = xmm10;
|
||||
chaining[3] = xmm11;
|
||||
chaining[4] = xmm12;
|
||||
chaining[5] = xmm13;
|
||||
chaining[6] = xmm14;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF1024_2way( __m256i* chaining )
|
||||
{
|
||||
static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m256i TEMP0;
|
||||
static __m256i TEMP1;
|
||||
static __m256i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
xmm9 = chaining[1];
|
||||
xmm10 = chaining[2];
|
||||
xmm11 = chaining[3];
|
||||
xmm12 = chaining[4];
|
||||
xmm13 = chaining[5];
|
||||
xmm14 = chaining[6];
|
||||
xmm15 = chaining[7];
|
||||
|
||||
/* compute permutation P */
|
||||
/* result: P(CV) in xmm8...xmm15 */
|
||||
ROUNDS_P_2WAY();
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
|
||||
xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) );
|
||||
xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) );
|
||||
xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) );
|
||||
xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) );
|
||||
xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) );
|
||||
xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) );
|
||||
xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) );
|
||||
|
||||
/* transpose CV back from row ordering to column ordering */
|
||||
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
|
||||
Matrix_Transpose_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[4] = xmm0;
|
||||
chaining[5] = xmm6;
|
||||
chaining[6] = xmm13;
|
||||
chaining[7] = xmm15;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif // VAES
|
||||
#endif // GROESTL512_INTR_4WAY_H__
|
||||
|
@@ -174,24 +174,19 @@ void allium_16way_hash( void *state, const void *input )
|
||||
#if defined(__VAES__)
|
||||
|
||||
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 );
|
||||
dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
|
||||
|
||||
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 );
|
||||
dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
|
||||
|
||||
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 );
|
||||
dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
|
||||
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
|
||||
|
||||
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 );
|
||||
dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
|
||||
|
||||
#else
|
||||
@@ -262,8 +257,11 @@ typedef struct {
|
||||
keccak256_4way_context keccak;
|
||||
cubehashParam cube;
|
||||
skein256_4way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_2way_context groestl;
|
||||
#else
|
||||
hashState_groestl256 groestl;
|
||||
|
||||
#endif
|
||||
} allium_8way_ctx_holder;
|
||||
|
||||
static __thread allium_8way_ctx_holder allium_8way_ctx;
|
||||
@@ -273,7 +271,11 @@ bool init_allium_8way_ctx()
|
||||
keccak256_4way_init( &allium_8way_ctx.keccak );
|
||||
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
|
||||
skein256_4way_init( &allium_8way_ctx.skein );
|
||||
#if defined(__VAES__)
|
||||
groestl256_2way_init( &allium_8way_ctx.groestl, 32 );
|
||||
#else
|
||||
init_groestl256( &allium_8way_ctx.groestl, 32 );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -352,9 +354,28 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
skein256_4way_update( &ctx.skein, vhashB, 32 );
|
||||
skein256_4way_close( &ctx.skein, vhashB );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
|
||||
|
||||
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
|
||||
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
|
||||
groestl256_2way_full( &ctx.groestl, vhashD, vhashD, 32 );
|
||||
dintrlv_2x128( hash0, hash1, vhashC, 256 );
|
||||
dintrlv_2x128( hash2, hash3, vhashD, 256 );
|
||||
|
||||
rintrlv_4x64_2x128( vhashC, vhashD, vhashB, 256 );
|
||||
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
|
||||
groestl256_2way_full( &ctx.groestl, vhashD, vhashD, 32 );
|
||||
dintrlv_2x128( hash4, hash5, vhashC, 256 );
|
||||
dintrlv_2x128( hash6, hash7, vhashD, 256 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
|
||||
|
||||
|
||||
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
|
||||
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
|
||||
groestl256_full( &ctx.groestl, hash2, hash2, 256 );
|
||||
@@ -363,6 +384,8 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
groestl256_full( &ctx.groestl, hash5, hash5, 256 );
|
||||
groestl256_full( &ctx.groestl, hash6, hash6, 256 );
|
||||
groestl256_full( &ctx.groestl, hash7, hash7, 256 );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
|
@@ -187,7 +187,8 @@ bool register_allium_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_allium;
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
|
||||
| VAES_OPT | VAES256_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -3,36 +3,38 @@
|
||||
bool register_sha256t_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(SHA256T_8WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t_8way;
|
||||
gate->hash = (void*)&sha256t_8way_hash;
|
||||
#elif defined(SHA256T_4WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_sha256t_4way;
|
||||
gate->hash = (void*)&sha256t_4way_hash;
|
||||
/*
|
||||
#else
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t;
|
||||
gate->hash = (void*)&sha256t_hash;
|
||||
*/
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_sha256q_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(SHA256T_8WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256q_8way;
|
||||
gate->hash = (void*)&sha256q_8way_hash;
|
||||
#elif defined(SHA256T_4WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_sha256q_4way;
|
||||
gate->hash = (void*)&sha256q_4way_hash;
|
||||
/*
|
||||
#else
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256q;
|
||||
gate->hash = (void*)&sha256q_hash;
|
||||
*/
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
return true;
|
||||
|
||||
}
|
||||
|
@@ -4,13 +4,10 @@
|
||||
#include <stdint.h>
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
// Override multi way on ryzen, SHA is better.
|
||||
#if !defined(__SHA__)
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__AVX2__)
|
||||
#define SHA256T_8WAY
|
||||
#elif defined(__SSE2__)
|
||||
#else
|
||||
#define SHA256T_4WAY
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bool register_sha256t_algo( algo_gate_t* gate );
|
||||
@@ -36,12 +33,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
/*
|
||||
void sha256t_hash( void *output, const void *input );
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void sha256q_hash( void *output, const void *input );
|
||||
int scanhash_sha256q( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
*/
|
||||
#endif
|
||||
|
||||
|
@@ -1,5 +1,7 @@
|
||||
#include "sha256t-gate.h"
|
||||
|
||||
// Obsolete
|
||||
|
||||
#if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
|
@@ -26,7 +26,11 @@ static const uint32_t IV512[] =
|
||||
static void
|
||||
c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
{
|
||||
#if defined(__VAES__)
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
#else
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
#endif
|
||||
__m256i p0, p1, p2, p3, x;
|
||||
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
|
||||
__m256i *m = (__m256i*)msg;
|
||||
|
@@ -619,11 +619,20 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
break;
|
||||
case GROESTL:
|
||||
#if defined(__VAES__)
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
groestl512_2way_full( &ctx.groestl, vhash, vhash, size );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
groestl512_2way_full( &ctx.groestl, vhash, vhash, size );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
#else
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
|
||||
break;
|
||||
#endif
|
||||
break;
|
||||
case JH:
|
||||
if ( i == 0 )
|
||||
jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
|
||||
@@ -711,11 +720,20 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
}
|
||||
break;
|
||||
case SHAVITE:
|
||||
#if defined(__VAES__)
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
shavite512_2way_full( &ctx.shavite, vhash, vhash, size );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
shavite512_2way_full( &ctx.shavite, vhash, vhash, size );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
#else
|
||||
shavite512_full( &ctx.shavite, hash0, in0, size );
|
||||
shavite512_full( &ctx.shavite, hash1, in1, size );
|
||||
shavite512_full( &ctx.shavite, hash2, in2, size );
|
||||
shavite512_full( &ctx.shavite, hash3, in3, size );
|
||||
break;
|
||||
#endif
|
||||
break;
|
||||
case SIMD:
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
simd512_2way_full( &ctx.simd, vhash, vhash, size );
|
||||
@@ -725,6 +743,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
break;
|
||||
case ECHO:
|
||||
#if defined(__VAES__)
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
echo_2way_full( &ctx.echo, vhash, 512, vhash, size );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
echo_2way_full( &ctx.echo, vhash, 512, vhash, size );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
#else
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
(const BitSequence *)in0, size );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
|
||||
@@ -733,7 +759,8 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
|
||||
(const BitSequence *)in2, size );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||
(const BitSequence *)in3, size );
|
||||
break;
|
||||
#endif
|
||||
break;
|
||||
case HAMSI:
|
||||
if ( i == 0 )
|
||||
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
|
||||
|
@@ -61,7 +61,8 @@ bool register_x16r_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -79,7 +80,8 @@ bool register_x16rv2_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16rv2;
|
||||
gate->hash = (void*)&x16rv2_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -97,7 +99,8 @@ bool register_x16s_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16r;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -230,7 +233,8 @@ bool register_x16rt_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16rt;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -247,7 +251,8 @@ bool register_x16rt_veil_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x16rt;
|
||||
gate->hash = (void*)&x16r_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
gate->build_extraheader = (void*)&veil_build_extraheader;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -277,22 +282,17 @@ bool register_x21s_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x21s_8way;
|
||||
gate->hash = (void*)&x21s_8way_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_8way_thread_init;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
|
||||
| VAES_OPT;
|
||||
#elif defined (X16R_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x21s_4way;
|
||||
gate->hash = (void*)&x21s_4way_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_4way_thread_init;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
|
||||
| AVX512_OPT | VAES_OPT;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x21s;
|
||||
gate->hash = (void*)&x21s_hash;
|
||||
gate->miner_thread_init = (void*)&x21s_thread_init;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
|
||||
| AVX512_OPT | VAES_OPT;
|
||||
#endif
|
||||
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
|
@@ -41,6 +41,7 @@
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#if defined(__VAES__)
|
||||
#include "algo/groestl/groestl512-hash-4way.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/shavite/shavite-hash-4way.h"
|
||||
#include "algo/echo/echo-hash-4way.h"
|
||||
#endif
|
||||
@@ -145,15 +146,21 @@ union _x16r_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_echo echo;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
shavite512_2way_context shavite;
|
||||
echo_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
hashState_luffa luffa1;
|
||||
cubehashParam cube;
|
||||
shavite512_context shavite;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
|
@@ -672,14 +672,20 @@ union _x16rv2_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_echo echo;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
shavite512_2way_context shavite;
|
||||
echo_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
shavite512_context shavite;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cubehashParam cube;
|
||||
shavite512_context shavite;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
@@ -745,10 +751,19 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
break;
|
||||
case GROESTL:
|
||||
#if defined(__VAES__)
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
groestl512_2way_full( &ctx.groestl, vhash, vhash, size );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
groestl512_2way_full( &ctx.groestl, vhash, vhash, size );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
#else
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
|
||||
#endif
|
||||
break;
|
||||
case JH:
|
||||
if ( i == 0 )
|
||||
@@ -887,10 +902,19 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
}
|
||||
break;
|
||||
case SHAVITE:
|
||||
#if defined(__VAES__)
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
shavite512_2way_full( &ctx.shavite, vhash, vhash, size );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
shavite512_2way_full( &ctx.shavite, vhash, vhash, size );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
#else
|
||||
shavite512_full( &ctx.shavite, hash0, in0, size );
|
||||
shavite512_full( &ctx.shavite, hash1, in1, size );
|
||||
shavite512_full( &ctx.shavite, hash2, in2, size );
|
||||
shavite512_full( &ctx.shavite, hash3, in3, size );
|
||||
#endif
|
||||
break;
|
||||
case SIMD:
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
@@ -901,6 +925,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
break;
|
||||
case ECHO:
|
||||
#if defined(__VAES__)
|
||||
intrlv_2x128( vhash, in0, in1, size<<3 );
|
||||
echo_2way_full( &ctx.echo, vhash, 512, vhash, size );
|
||||
dintrlv_2x128_512( hash0, hash1, vhash );
|
||||
intrlv_2x128( vhash, in2, in3, size<<3 );
|
||||
echo_2way_full( &ctx.echo, vhash, 512, vhash, size );
|
||||
dintrlv_2x128_512( hash2, hash3, vhash );
|
||||
#else
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
(const BitSequence *)in0, size );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
|
||||
@@ -909,6 +941,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
|
||||
(const BitSequence *)in2, size );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||
(const BitSequence *)in3, size );
|
||||
#endif
|
||||
break;
|
||||
case HAMSI:
|
||||
if ( i == 0 )
|
||||
|
@@ -1124,7 +1124,13 @@ union _sonoa_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
echo512_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
@@ -1132,7 +1138,6 @@ union _sonoa_4way_context_overlay
|
||||
cube_2way_context cube;
|
||||
shavite512_2way_context shavite;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_4way_context shabal;
|
||||
@@ -1162,6 +1167,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -1171,6 +1187,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -1195,6 +1213,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -1206,16 +1233,29 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
(const BitSequence *)hash2, 64 );
|
||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||
(const BitSequence *)hash3, 64 );
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
if ( work_restart[thr_id].restart ) return 0;
|
||||
// 2
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -1225,6 +1265,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -1249,6 +1291,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -1263,6 +1314,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
@@ -1274,6 +1327,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -1283,6 +1347,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -1307,6 +1373,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -1321,6 +1396,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
@@ -1340,6 +1417,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -1349,6 +1437,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -1373,6 +1463,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -1387,6 +1486,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
@@ -1410,6 +1511,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
hamsi512_4way_update( &ctx.hamsi, vhashB, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
|
||||
@@ -1424,6 +1534,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
intrlv_2x128_512( vhashA, hash0, hash1 );
|
||||
intrlv_2x128_512( vhashB, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_init( &ctx.shavite );
|
||||
@@ -1443,6 +1555,20 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
shabal512_4way_update( &ctx.shabal, vhashB, 64 );
|
||||
shabal512_4way_close( &ctx.shabal, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
// rintrlv_4x32_2x128( vhashA, vhashB, vhash, 512 );
|
||||
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
|
||||
intrlv_2x128_512( vhashA, hash0, hash1 );
|
||||
intrlv_2x128_512( vhashB, hash2, hash3 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -1452,6 +1578,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -1476,6 +1604,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -1490,6 +1627,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
@@ -1523,6 +1662,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -1532,6 +1682,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -1556,6 +1708,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -1570,6 +1731,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
@@ -1616,6 +1779,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -1625,6 +1799,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -1649,6 +1825,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -1663,6 +1848,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
@@ -12,7 +12,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
|
||||
init_sonoa_ctx();
|
||||
gate->hash = (void*)&sonoa_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -240,7 +240,13 @@ union _x17_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
echo512_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
@@ -248,7 +254,6 @@ union _x17_4way_context_overlay
|
||||
cube_2way_context cube;
|
||||
shavite512_2way_context shavite;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_4way_context shabal;
|
||||
@@ -275,6 +280,17 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
@@ -284,6 +300,8 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -308,6 +326,15 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -322,6 +349,8 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
@@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate )
|
||||
#else
|
||||
gate->hash = (void*)&x17_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -405,15 +405,20 @@ union _xevan_4way_context_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
hashState_groestl groestl;
|
||||
skein512_4way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
echo_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
shavite512_2way_context shavite;
|
||||
simd_2way_context simd;
|
||||
hashState_echo echo;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_4way_context shabal;
|
||||
@@ -442,7 +447,17 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
// Serial
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, dataLen );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, dataLen );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
|
||||
@@ -450,9 +465,10 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );
|
||||
|
||||
// Parallel 4way
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -477,6 +493,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
|
||||
|
||||
@@ -489,9 +514,10 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
|
||||
(const BitSequence *)hash3, dataLen );
|
||||
|
||||
// Parallel
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
@@ -542,6 +568,17 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||
bmw512_4way_update( &ctx.bmw, vhash, dataLen );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, dataLen );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, dataLen );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
|
||||
@@ -551,6 +588,8 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
|
||||
|
||||
jh512_4way_init( &ctx.jh );
|
||||
@@ -575,6 +614,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
|
||||
dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
|
||||
|
||||
@@ -589,6 +637,8 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
|
||||
|
||||
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
|
||||
|
||||
#endif
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
|
||||
hamsi512_4way_close( &ctx.hamsi, vhash );
|
||||
|
@@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate )
|
||||
init_xevan_ctx();
|
||||
gate->hash = (void*)&xevan_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
@@ -11,7 +11,7 @@
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/hamsi/hamsi-hash-4way.h"
|
||||
#include "algo/fugue/fugue-aesni.h"
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
@@ -494,14 +494,19 @@ union _x22i_4way_ctx_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
echo_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
shavite512_2way_context shavite;
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
shavite512_2way_context shavite;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
@@ -535,14 +540,28 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
|
||||
bmw512_4way_init( &ctx.bmw );
|
||||
bmw512_4way_update( &ctx.bmw, vhash, 64 );
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (const char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (const char*)hash1, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (const char*)hash2, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (const char*)hash3, 512 );
|
||||
#if defined(__VAES__)
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
|
||||
@@ -570,6 +589,15 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
dintrlv_2x128_512( hash0, hash1, vhashA );
|
||||
dintrlv_2x128_512( hash2, hash3, vhashB );
|
||||
|
||||
@@ -584,6 +612,8 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
|
||||
|
||||
#endif
|
||||
|
||||
if ( work_restart[thrid].restart ) return false;
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
|
@@ -20,7 +20,7 @@ bool register_x22i_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x22i;
|
||||
gate->hash = (void*)&x22i_hash;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
|
||||
| AVX512_OPT | VAES_OPT;
|
||||
| AVX512_OPT | VAES_OPT | VAES256_OPT;
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
@@ -30,20 +30,15 @@ bool register_x25x_algo( algo_gate_t* gate )
|
||||
#if defined (X25X_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_x25x_8way;
|
||||
gate->hash = (void*)&x25x_8way_hash;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT
|
||||
| AVX512_OPT | VAES_OPT;
|
||||
#elif defined (X25X_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_x25x_4way;
|
||||
gate->hash = (void*)&x25x_4way_hash;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
|
||||
| AVX512_OPT | VAES_OPT;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_x25x;
|
||||
gate->hash = (void*)&x25x_hash;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
|
||||
| AVX512_OPT | VAES_OPT;
|
||||
#endif
|
||||
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
|
||||
VAES_OPT | VAES256_OPT;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
@@ -15,6 +15,7 @@
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/luffa/luffa-hash-2way.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/shavite/shavite-hash-2way.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
@@ -412,9 +413,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
|
||||
LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
|
||||
dintrlv_2x256( hash6[19], hash7[19], vhash, 256 );
|
||||
|
||||
sph_gost512_init(&ctx.gost);
|
||||
sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
|
||||
sph_gost512_close(&ctx.gost, (void*) hash0[20]);
|
||||
sph_gost512_init(&ctx.gost);
|
||||
sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
|
||||
sph_gost512_close(&ctx.gost, (void*) hash0[20]);
|
||||
sph_gost512_init(&ctx.gost);
|
||||
sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
|
||||
sph_gost512_close(&ctx.gost, (void*) hash1[20]);
|
||||
@@ -574,68 +575,26 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_x25x_8way( struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x08ff;
|
||||
|
||||
InitializeSWIFFTX();
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
do
|
||||
{
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
x25x_8way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
#elif defined(X25X_4WAY)
|
||||
|
||||
union _x25x_4way_ctx_overlay
|
||||
{
|
||||
blake512_4way_context blake;
|
||||
bmw512_4way_context bmw;
|
||||
#if defined(__VAES__)
|
||||
groestl512_2way_context groestl;
|
||||
echo_2way_context echo;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
#endif
|
||||
skein512_4way_context skein;
|
||||
jh512_4way_context jh;
|
||||
keccak512_4way_context keccak;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
luffa_2way_context luffa;
|
||||
cube_2way_context cube;
|
||||
shavite512_2way_context shavite;
|
||||
simd_2way_context simd;
|
||||
hamsi512_4way_context hamsi;
|
||||
hashState_fugue fugue;
|
||||
shabal512_4way_context shabal;
|
||||
@@ -658,6 +617,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
|
||||
unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
|
||||
unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
|
||||
unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
|
||||
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
blake512_4way_full( &ctx.blake, vhash, input, 80 );
|
||||
@@ -668,11 +629,25 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
|
||||
bmw512_4way_close( &ctx.bmw, vhash );
|
||||
dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
|
||||
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
|
||||
|
||||
dintrlv_2x128_512( hash0[2], hash1[2], vhashA );
|
||||
dintrlv_2x128_512( hash2[2], hash3[2], vhashB );
|
||||
|
||||
#else
|
||||
|
||||
groestl512_full( &ctx.groestl, (char*)hash0[2], (const char*)hash0[1], 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash1[2], (const char*)hash1[1], 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash2[2], (const char*)hash2[1], 512 );
|
||||
groestl512_full( &ctx.groestl, (char*)hash3[2], (const char*)hash3[1], 512 );
|
||||
|
||||
#endif
|
||||
|
||||
intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
|
||||
skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
|
||||
dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );
|
||||
@@ -689,41 +664,38 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
|
||||
keccak512_4way_close( &ctx.keccak, vhash );
|
||||
dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );
|
||||
|
||||
luffa_full( &ctx.luffa, (BitSequence*)hash0[6], 512,
|
||||
(const BitSequence*)hash0[5], 64 );
|
||||
luffa_full( &ctx.luffa, (BitSequence*)hash1[6], 512,
|
||||
(const BitSequence*)hash1[5], 64 );
|
||||
luffa_full( &ctx.luffa, (BitSequence*)hash2[6], 512,
|
||||
(const BitSequence*)hash2[5], 64 );
|
||||
luffa_full( &ctx.luffa, (BitSequence*)hash3[6], 512,
|
||||
(const BitSequence*)hash3[5], 64 );
|
||||
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
|
||||
|
||||
cubehash_full( &ctx.cube, (byte*)hash0[7], 512, (const byte*)hash0[6], 64 );
|
||||
cubehash_full( &ctx.cube, (byte*)hash1[7], 512, (const byte*)hash1[6], 64 );
|
||||
cubehash_full( &ctx.cube, (byte*)hash2[7], 512, (const byte*)hash2[6], 64 );
|
||||
cubehash_full( &ctx.cube, (byte*)hash3[7], 512, (const byte*)hash3[6], 64 );
|
||||
luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
|
||||
luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );
|
||||
dintrlv_2x128_512( hash0[6], hash1[6], vhashA );
|
||||
dintrlv_2x128_512( hash2[6], hash3[6], vhashB );
|
||||
|
||||
sph_shavite512_init(&ctx.shavite);
|
||||
sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
|
||||
sph_shavite512_close(&ctx.shavite, hash0[8]);
|
||||
sph_shavite512_init(&ctx.shavite);
|
||||
sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
|
||||
sph_shavite512_close(&ctx.shavite, hash1[8]);
|
||||
sph_shavite512_init(&ctx.shavite);
|
||||
sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64);
|
||||
sph_shavite512_close(&ctx.shavite, hash2[8]);
|
||||
sph_shavite512_init(&ctx.shavite);
|
||||
sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
|
||||
sph_shavite512_close(&ctx.shavite, hash3[8]);
|
||||
cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
|
||||
cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
|
||||
dintrlv_2x128_512( hash0[7], hash1[7], vhashA );
|
||||
dintrlv_2x128_512( hash2[7], hash3[7], vhashB );
|
||||
|
||||
simd_full( &ctx.simd, (BitSequence*)hash0[9],
|
||||
(const BitSequence*)hash0[8], 512 );
|
||||
simd_full( &ctx.simd, (BitSequence*)hash1[9],
|
||||
(const BitSequence*)hash1[8], 512 );
|
||||
simd_full( &ctx.simd, (BitSequence*)hash2[9],
|
||||
(const BitSequence*)hash2[8], 512 );
|
||||
simd_full( &ctx.simd, (BitSequence*)hash3[9],
|
||||
(const BitSequence*)hash3[8], 512 );
|
||||
shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
|
||||
shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );
|
||||
dintrlv_2x128_512( hash0[8], hash1[8], vhashA );
|
||||
dintrlv_2x128_512( hash2[8], hash3[8], vhashB );
|
||||
|
||||
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
|
||||
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
|
||||
dintrlv_2x128_512( hash0[9], hash1[9], vhashA );
|
||||
dintrlv_2x128_512( hash2[9], hash3[9], vhashB );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
|
||||
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
|
||||
dintrlv_2x128_512( hash0[10], hash1[10], vhashA );
|
||||
dintrlv_2x128_512( hash2[10], hash3[10], vhashB );
|
||||
|
||||
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
|
||||
|
||||
#else
|
||||
|
||||
echo_full( &ctx.echo, (BitSequence *)hash0[10], 512,
|
||||
(const BitSequence *)hash0[ 9], 64 );
|
||||
@@ -736,6 +708,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
|
||||
|
||||
intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );
|
||||
|
||||
#endif
|
||||
|
||||
if ( work_restart[thrid].restart ) return 0;
|
||||
|
||||
hamsi512_4way_init( &ctx.hamsi );
|
||||
|
@@ -4,8 +4,9 @@
|
||||
# during develpment. However the information contained may provide compilation
|
||||
# tips to users.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
|
||||
|
||||
# Icelake AVX512 SHA VAES
|
||||
make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
@@ -16,6 +17,20 @@ mv cpuminer.exe cpuminer-avx512-sha-vaes.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx512-sha-vaes
|
||||
|
||||
# Rocketlake AVX512 AES SHA
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=skylake-avx512 -msha -Wall -fno-common" ./configure --with-curl
|
||||
# CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-avx512-sha.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx512-sha
|
||||
|
||||
# Slylake-X AVX512 AES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-common" ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
@@ -23,6 +38,7 @@ mv cpuminer.exe cpuminer-avx512.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx512
|
||||
|
||||
# Haswell AVX2 AES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
# GCC 9 doesn't include AES with core-avx2
|
||||
@@ -33,6 +49,7 @@ mv cpuminer.exe cpuminer-avx2.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx2
|
||||
|
||||
# Sandybridge AVX AES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl
|
||||
@@ -42,15 +59,17 @@ mv cpuminer.exe cpuminer-avx.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx
|
||||
|
||||
# Westmere SSE4.2 AES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -maes -msse4.2 -Wall -fno-common" ./configure --with-curl
|
||||
CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe cpuminer-aes-sse42.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-aes-sse42
|
||||
|
||||
# Nehalem SSE4.2
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl
|
||||
@@ -60,6 +79,7 @@ mv cpuminer.exe cpuminer-sse42.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-sse42
|
||||
|
||||
# Core2 SSSE3
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl
|
||||
@@ -69,6 +89,7 @@ mv cpuminer.exe cpuminer-ssse3.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-ssse3
|
||||
|
||||
# Generic SSE2
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl
|
||||
@@ -78,6 +99,7 @@ mv cpuminer.exe cpuminer-sse2.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-sse2
|
||||
|
||||
# Zen1 AVX2 SHA
|
||||
make clean || echo done
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=znver1 -Wall -fno-common" ./configure --with-curl
|
||||
@@ -87,6 +109,7 @@ mv cpuminer.exe cpuminer-zen.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-zen
|
||||
|
||||
# Zen3 AVX2 SHA VAES
|
||||
make clean || echo done
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl
|
||||
@@ -97,6 +120,7 @@ mv cpuminer.exe cpuminer-zen3.exe
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-zen3
|
||||
|
||||
# Native to current CPU
|
||||
make clean || echo done
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl
|
||||
|
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
#if [ "$OS" = "Windows_NT" ]; then
|
||||
# ./mingw64.sh
|
||||
# exit 0
|
||||
#fi
|
||||
|
||||
# Linux build
|
||||
|
||||
make distclean || echo clean
|
||||
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
|
||||
# Ubuntu 10.04 (gcc 4.4)
|
||||
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
|
||||
|
||||
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
|
||||
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
|
||||
|
||||
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
|
||||
CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
|
||||
|
||||
make -j 4
|
||||
|
||||
strip -s cpuminer
|
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
#if [ "$OS" = "Windows_NT" ]; then
|
||||
# ./mingw64.sh
|
||||
# exit 0
|
||||
#fi
|
||||
|
||||
# Linux build
|
||||
|
||||
make distclean || echo clean
|
||||
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
|
||||
# Ubuntu 10.04 (gcc 4.4)
|
||||
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
|
||||
|
||||
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
|
||||
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
|
||||
|
||||
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
|
||||
CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
|
||||
|
||||
make -j 4
|
||||
|
||||
strip -s cpuminer
|
7
build.sh
7
build.sh
@@ -12,15 +12,8 @@ make distclean || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
|
||||
# Ubuntu 10.04 (gcc 4.4)
|
||||
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
|
||||
|
||||
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
|
||||
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
|
||||
|
||||
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
|
||||
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
|
||||
|
||||
make -j 4
|
||||
|
||||
|
27
buildjdd.sh
27
buildjdd.sh
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
#if [ "$OS" = "Windows_NT" ]; then
|
||||
# ./mingw64.sh
|
||||
# exit 0
|
||||
#fi
|
||||
|
||||
# Linux build
|
||||
|
||||
make distclean || echo clean
|
||||
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
|
||||
# Ubuntu 10.04 (gcc 4.4)
|
||||
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
|
||||
|
||||
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
|
||||
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
|
||||
|
||||
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
|
||||
|
||||
make -j 4
|
||||
|
||||
strip -s cpuminer
|
@@ -2,8 +2,8 @@
|
||||
#
|
||||
# make clean and rm all the targetted executables.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null
|
||||
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null
|
||||
|
||||
make distclean > /dev/null
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.1.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.2.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.15.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.15.1'
|
||||
PACKAGE_VERSION='3.15.2'
|
||||
PACKAGE_STRING='cpuminer-opt 3.15.2'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.15.1 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.15.2 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.15.1:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.15.2:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.15.1
|
||||
cpuminer-opt configure 3.15.2
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.15.1, which was
|
||||
It was created by cpuminer-opt $as_me 3.15.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.15.1'
|
||||
VERSION='3.15.2'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.15.1, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.15.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6756,7 +6756,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.15.1
|
||||
cpuminer-opt config.status 3.15.2
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.15.1])
|
||||
AC_INIT([cpuminer-opt], [3.15.2])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
18
cpu-miner.c
18
cpu-miner.c
@@ -3383,13 +3383,14 @@ bool check_cpu_capability ()
|
||||
bool sw_has_sha = false;
|
||||
bool sw_has_vaes = false;
|
||||
set_t algo_features = algo_gate.optimizations;
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
|
||||
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
|
||||
bool algo_has_aes = set_incl( AES_OPT, algo_features );
|
||||
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
|
||||
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
|
||||
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
|
||||
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
|
||||
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
|
||||
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
|
||||
bool use_aes;
|
||||
bool use_sse2;
|
||||
bool use_sse42;
|
||||
@@ -3510,7 +3511,8 @@ bool check_cpu_capability ()
|
||||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
|
||||
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
|
||||
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
|
||||
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes && use_avx512;
|
||||
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes
|
||||
&& ( use_avx512 || algo_has_vaes256 );
|
||||
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
|
||||
use_sha || use_vaes );
|
||||
|
||||
|
@@ -143,8 +143,8 @@ do { \
|
||||
// Parallel AES, for when x is expected to be in a 256 bit register.
|
||||
// Use same 128 bit key.
|
||||
|
||||
//#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if 0
|
||||
#if defined(__VAES__)
|
||||
|
||||
#define mm256_aesenc_2x128( x, k ) \
|
||||
_mm256_aesenc_epi128( x, k )
|
||||
|
||||
|
12
sysinfos.c
12
sysinfos.c
@@ -483,11 +483,13 @@ static inline bool has_avx512()
|
||||
// AMD Zen3 added support for 256 bit VAES without requiring AVX512.
|
||||
// The original Intel spec requires AVX512F to support 512 bit VAES and
|
||||
// requires AVX512VL to support 256 bit VAES.
|
||||
// cpuminer-opt only uses VAES512, simply testing the VAES bit is sufficient.
|
||||
// However, proper detection of VAES512 and VAES256 requires more work:
|
||||
// VAES512 = VAES && AVX512F (may not support VAES256)
|
||||
// VAES256 = AVX512VL ? VAES : ( AVX && VAES ) (may not support VAES512)
|
||||
// VAES = VAES && AVX512F && AVX512VL (supports both)
|
||||
// The CPUID VAES bit alone can't distiguish 256 vs 512 bit.
|
||||
// If necessary:
|
||||
// VAES 256 & 512 = VAES && AVX512VL
|
||||
// VAES 512 = VAES && AVX512F
|
||||
// VAES 256 = ( VAES && AVX512VL ) || ( VAES && !AVX512F )
|
||||
// VAES 512 only = VAES && AVX512F && !AVX512VL
|
||||
// VAES 256 only = VAES && !AVX512F
|
||||
|
||||
static inline bool has_vaes()
|
||||
{
|
||||
|
@@ -40,6 +40,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
|
||||
|
||||
# Start building...
|
||||
|
||||
# Icelake AVX512 SHA VAES
|
||||
./clean-all.sh || echo clean
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
@@ -48,6 +49,7 @@ make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
|
||||
|
||||
# Zen1 AVX2 SHA
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
|
||||
@@ -55,6 +57,16 @@ make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-zen.exe
|
||||
|
||||
# Zen3 AVX2 SHA VAES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS
|
||||
# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-zen3.exe
|
||||
|
||||
# Slylake-X AVX512 AES
|
||||
# mingw won't compile avx512 without -fno-asynchronous-unwind-tables
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
@@ -64,6 +76,7 @@ make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx512.exe
|
||||
|
||||
# Haswell AVX2 AES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
# GCC 9 doesn't include AES in -march=core-avx2
|
||||
@@ -72,6 +85,7 @@ make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx2.exe
|
||||
|
||||
# Sandybridge AVX AES
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
# -march=corei7-avx still includes aes, but just in case
|
||||
@@ -80,6 +94,7 @@ make -j 8
|
||||
strip -s cpuminer.exe
|
||||
mv cpuminer.exe release/cpuminer-avx.exe
|
||||
|
||||
# Westmere SSE4.2 AES
|
||||
# -march=westmere is supported in gcc5
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
@@ -104,6 +119,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
|
||||
#mv cpuminer.exe release/cpuminer-ssse3.exe
|
||||
#make clean || echo clean
|
||||
|
||||
# Generic SSE2
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS
|
||||
|
Reference in New Issue
Block a user