This commit is contained in:
Jay D Dee
2021-12-30 16:28:24 -05:00
parent 7d2ef7973d
commit 0e3945ddb5
8 changed files with 155 additions and 49 deletions

View File

@@ -37,12 +37,23 @@ typedef struct
} hashState_fugue __attribute__ ((aligned (64)));
// These functions are deprecated, use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
#define fugue512_init( state ) \
fugue512_Init( state, 512 )
#define fugue512_update( state, data, len ) \
fugue512_Update( state, data, (len)<<3 )
#define fugue512_final \
fugue512_Final
HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
#endif // AES

View File

@@ -550,16 +550,38 @@ static const sph_u32 T512[64][16] = {
// Hamsi 8 way AVX512
// Tested on i9-9940x movepi64_mask is slow, cmple_epi64_mask with zero
// produces the same result but is faster.
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
__m512i db = _mm512_ror_epi64( *buf, 1 ); \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
dm = mm512_negate_32( _mm512_or_si512( dm, \
_mm512_slli_epi64( dm, 32 ) ) ); \
__mmask8 dm = _mm512_cmplt_epi64_mask( db, m512_zero ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \
db = _mm512_ror_epi64( db, 1 ); \
tp += 8; \
} \
} while (0)
/*
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = mm512_negate_64( _mm512_and_si512( db, m512_one_64 ) ); \
m0 = mm512_xorand( m0, dm, m512_const1_64( tp[0] ) ); \
m1 = mm512_xorand( m1, dm, m512_const1_64( tp[1] ) ); \
m2 = mm512_xorand( m2, dm, m512_const1_64( tp[2] ) ); \
@@ -572,6 +594,7 @@ do { \
db = _mm512_srli_epi64( db, 1 ); \
} \
} while (0)
*/
#define SBOX8( a, b, c, d ) \
do { \
@@ -888,13 +911,11 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
#define INPUT_BIG \
do { \
__m256i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
dm = mm256_negate_32( _mm256_or_si256( dm, \
_mm256_slli_epi64( dm, 32 ) ) ); \
__m256i dm = mm256_negate_64( _mm256_and_si256( db, m256_one_64 ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
m256_const1_64( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \

View File

@@ -1544,7 +1544,6 @@ bool register_scrypt_algo( algo_gate_t* gate )
format_number_si( &t_size, t_units );
format_number_si( &d_size, d_units );
applog( LOG_INFO,"Throughput %d/thr, Buffer %.0f %siB/thr, Total %.0f %siB\n",
SCRYPT_THROUGHPUT, t_size, t_units, d_size, d_units );

View File

@@ -60,7 +60,14 @@ void x16r_8way_prehash( void *vdata, void *pdata )
case HAMSI:
mm512_bswap32_intrlv80_8x64( vdata, pdata );
hamsi512_8way_init( &x16r_ctx.hamsi );
hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 );
hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
mm128_bswap32_80( edata, pdata );
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
intrlv_8x64( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm256_bswap32_intrlv80_8x32( vdata2, pdata );
@@ -306,7 +313,7 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -319,14 +326,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
hash7, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in4 + 76, 4 );
fugue512_final( &ctx.fugue, hash4 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in5 + 76, 4 );
fugue512_final( &ctx.fugue, hash5 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in6 + 76, 4 );
fugue512_final( &ctx.fugue, hash6 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in7 + 76, 4 );
fugue512_final( &ctx.fugue, hash7 );
}
else
{
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
fugue512_full( &ctx.fugue, hash4, in4, size );
fugue512_full( &ctx.fugue, hash5, in5, size );
fugue512_full( &ctx.fugue, hash6, in6, size );
fugue512_full( &ctx.fugue, hash7, in7, size );
}
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
@@ -347,25 +383,25 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid )
{
sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
memcpy( &ctx, &x16r_ctx, sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
}
@@ -532,7 +568,13 @@ void x16r_4way_prehash( void *vdata, void *pdata )
case HAMSI:
mm256_bswap32_intrlv80_4x64( vdata, pdata );
hamsi512_4way_init( &x16r_ctx.hamsi );
hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 );
hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 );
break;
case FUGUE:
mm128_bswap32_80( edata, pdata );
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
break;
case SHABAL:
mm128_bswap32_intrlv80_4x32( vdata2, pdata );
@@ -734,7 +776,7 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
break;
case HAMSI:
if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 );
else
{
intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
@@ -745,10 +787,27 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
fugue512_final( &ctx.fugue, hash0 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in1 + 76, 4 );
fugue512_final( &ctx.fugue, hash1 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in2 + 76, 4 );
fugue512_final( &ctx.fugue, hash2 );
memcpy( &ctx, &x16r_ctx, sizeof(hashState_fugue) );
fugue512_update( &ctx.fugue, in3 + 76, 4 );
fugue512_final( &ctx.fugue, hash3 );
}
else
{
fugue512_full( &ctx.fugue, hash0, in0, size );
fugue512_full( &ctx.fugue, hash1, in1, size );
fugue512_full( &ctx.fugue, hash2, in2, size );
fugue512_full( &ctx.fugue, hash3, in3, size );
}
break;
case SHABAL:
intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );