mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.19.7
This commit is contained in:
@@ -594,9 +594,6 @@ void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
|
||||
#define rb6(x) mm256_rol_64( x, 43 )
|
||||
#define rb7(x) mm256_rol_64( x, 53 )
|
||||
|
||||
#define rol_off_64( M, j ) \
|
||||
mm256_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
||||
_mm256_xor_si256( h, _mm256_add_epi64( K, \
|
||||
_mm256_sub_epi64( _mm256_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||
@@ -732,8 +729,23 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
|
||||
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
|
||||
|
||||
__m256i mj[16];
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mj[i] = rol_off_64( M, i );
|
||||
|
||||
mj[ 0] = mm256_rol_64( M[ 0], 1 );
|
||||
mj[ 1] = mm256_rol_64( M[ 1], 2 );
|
||||
mj[ 2] = mm256_rol_64( M[ 2], 3 );
|
||||
mj[ 3] = mm256_rol_64( M[ 3], 4 );
|
||||
mj[ 4] = mm256_rol_64( M[ 4], 5 );
|
||||
mj[ 5] = mm256_rol_64( M[ 5], 6 );
|
||||
mj[ 6] = mm256_rol_64( M[ 6], 7 );
|
||||
mj[ 7] = mm256_rol_64( M[ 7], 8 );
|
||||
mj[ 8] = mm256_rol_64( M[ 8], 9 );
|
||||
mj[ 9] = mm256_rol_64( M[ 9], 10 );
|
||||
mj[10] = mm256_rol_64( M[10], 11 );
|
||||
mj[11] = mm256_rol_64( M[11], 12 );
|
||||
mj[12] = mm256_rol_64( M[12], 13 );
|
||||
mj[13] = mm256_rol_64( M[13], 14 );
|
||||
mj[14] = mm256_rol_64( M[14], 15 );
|
||||
mj[15] = mm256_rol_64( M[15], 16 );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m256i)_mm256_set1_epi64x( 16 * 0x0555555555555555ULL ) );
|
||||
@@ -1034,9 +1046,6 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
#define r8b6(x) mm512_rol_64( x, 43 )
|
||||
#define r8b7(x) mm512_rol_64( x, 53 )
|
||||
|
||||
#define rol8w_off_64( M, j ) \
|
||||
mm512_rol_64( M[ (j) & 0xF ], ( (j) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_b8( mj0, mj3, mj10, h, K ) \
|
||||
_mm512_xor_si512( h, _mm512_add_epi64( K, \
|
||||
_mm512_sub_epi64( _mm512_add_epi64( mj0, mj3 ), mj10 ) ) )
|
||||
@@ -1171,41 +1180,73 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
|
||||
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
|
||||
|
||||
__m512i mj[16];
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mj[i] = rol8w_off_64( M, i );
|
||||
uint64_t K = 16 * 0x0555555555555555ULL;
|
||||
|
||||
mj[ 0] = mm512_rol_64( M[ 0], 1 );
|
||||
mj[ 1] = mm512_rol_64( M[ 1], 2 );
|
||||
mj[ 2] = mm512_rol_64( M[ 2], 3 );
|
||||
mj[ 3] = mm512_rol_64( M[ 3], 4 );
|
||||
mj[ 4] = mm512_rol_64( M[ 4], 5 );
|
||||
mj[ 5] = mm512_rol_64( M[ 5], 6 );
|
||||
mj[ 6] = mm512_rol_64( M[ 6], 7 );
|
||||
mj[ 7] = mm512_rol_64( M[ 7], 8 );
|
||||
mj[ 8] = mm512_rol_64( M[ 8], 9 );
|
||||
mj[ 9] = mm512_rol_64( M[ 9], 10 );
|
||||
mj[10] = mm512_rol_64( M[10], 11 );
|
||||
mj[11] = mm512_rol_64( M[11], 12 );
|
||||
mj[12] = mm512_rol_64( M[12], 13 );
|
||||
mj[13] = mm512_rol_64( M[13], 14 );
|
||||
mj[14] = mm512_rol_64( M[14], 15 );
|
||||
mj[15] = mm512_rol_64( M[15], 16 );
|
||||
|
||||
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7],
|
||||
(const __m512i)_mm512_set1_epi64( 16 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[17] = add_elt_b8( mj[ 1], mj[ 4], mj[11], H[ 8],
|
||||
(const __m512i)_mm512_set1_epi64( 17 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[18] = add_elt_b8( mj[ 2], mj[ 5], mj[12], H[ 9],
|
||||
(const __m512i)_mm512_set1_epi64( 18 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[19] = add_elt_b8( mj[ 3], mj[ 6], mj[13], H[10],
|
||||
(const __m512i)_mm512_set1_epi64( 19 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[20] = add_elt_b8( mj[ 4], mj[ 7], mj[14], H[11],
|
||||
(const __m512i)_mm512_set1_epi64( 20 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[21] = add_elt_b8( mj[ 5], mj[ 8], mj[15], H[12],
|
||||
(const __m512i)_mm512_set1_epi64( 21 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[22] = add_elt_b8( mj[ 6], mj[ 9], mj[ 0], H[13],
|
||||
(const __m512i)_mm512_set1_epi64( 22 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[23] = add_elt_b8( mj[ 7], mj[10], mj[ 1], H[14],
|
||||
(const __m512i)_mm512_set1_epi64( 23 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[24] = add_elt_b8( mj[ 8], mj[11], mj[ 2], H[15],
|
||||
(const __m512i)_mm512_set1_epi64( 24 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[25] = add_elt_b8( mj[ 9], mj[12], mj[ 3], H[ 0],
|
||||
(const __m512i)_mm512_set1_epi64( 25 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[26] = add_elt_b8( mj[10], mj[13], mj[ 4], H[ 1],
|
||||
(const __m512i)_mm512_set1_epi64( 26 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[27] = add_elt_b8( mj[11], mj[14], mj[ 5], H[ 2],
|
||||
(const __m512i)_mm512_set1_epi64( 27 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[28] = add_elt_b8( mj[12], mj[15], mj[ 6], H[ 3],
|
||||
(const __m512i)_mm512_set1_epi64( 28 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[29] = add_elt_b8( mj[13], mj[ 0], mj[ 7], H[ 4],
|
||||
(const __m512i)_mm512_set1_epi64( 29 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[30] = add_elt_b8( mj[14], mj[ 1], mj[ 8], H[ 5],
|
||||
(const __m512i)_mm512_set1_epi64( 30 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
K += 0x0555555555555555ULL;
|
||||
qt[31] = add_elt_b8( mj[15], mj[ 2], mj[ 9], H[ 6],
|
||||
(const __m512i)_mm512_set1_epi64( 31 * 0x0555555555555555ULL ) );
|
||||
(const __m512i)_mm512_set1_epi64( K ) );
|
||||
|
||||
|
||||
qt[16] = _mm512_add_epi64( qt[16], expand1_b8( qt, 16 ) );
|
||||
qt[17] = _mm512_add_epi64( qt[17], expand1_b8( qt, 17 ) );
|
||||
|
||||
@@ -261,7 +261,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||
// overlap it's unified.
|
||||
// As a result normal is Nrows-2 / Nrows.
|
||||
// for 4 rows: 1 unified, 2 overlap, 1 normal.
|
||||
// for 8 rows: 1 unified, 2 overlap, 56 normal.
|
||||
// for 8 rows: 1 unified, 2 overlap, 5 normal.
|
||||
|
||||
static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
||||
uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
|
||||
@@ -283,6 +283,15 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
io0 = _mm512_load_si512( inout0 );
|
||||
io1 = _mm512_load_si512( inout0 +1 );
|
||||
io2 = _mm512_load_si512( inout0 +2 );
|
||||
|
||||
io0 = _mm512_mask_load_epi64( io0, 0xf0, inout1 );
|
||||
io1 = _mm512_mask_load_epi64( io1, 0xf0, inout1 +1 );
|
||||
io2 = _mm512_mask_load_epi64( io2, 0xf0, inout1 +2 );
|
||||
|
||||
/*
|
||||
io0 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||
@@ -292,6 +301,7 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
|
||||
io2 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||
*/
|
||||
|
||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
|
||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
|
||||
@@ -359,6 +369,15 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
io0.v512 = _mm512_load_si512( inout0 );
|
||||
io1.v512 = _mm512_load_si512( inout0 +1 );
|
||||
io2.v512 = _mm512_load_si512( inout0 +2 );
|
||||
|
||||
io0.v512 = _mm512_mask_load_epi64( io0.v512, 0xf0, inout1 );
|
||||
io1.v512 = _mm512_mask_load_epi64( io1.v512, 0xf0, inout1 +1 );
|
||||
io2.v512 = _mm512_mask_load_epi64( io2.v512, 0xf0, inout1 +2 );
|
||||
|
||||
/*
|
||||
io0.v512 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||
@@ -368,27 +387,12 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
||||
io2.v512 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||
*/
|
||||
|
||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0.v512 ) );
|
||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1.v512 ) );
|
||||
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2.v512 ) );
|
||||
|
||||
/*
|
||||
io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||
io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +1 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +1 ) );
|
||||
io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||
|
||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
|
||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
|
||||
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
|
||||
*/
|
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
@@ -415,22 +419,6 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
||||
io2.v512 = _mm512_mask_blend_epi64( 0xf0, io2.v512, out[2] );
|
||||
}
|
||||
|
||||
/*
|
||||
if ( rowOut == rowInOut0 )
|
||||
{
|
||||
io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
|
||||
io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
|
||||
io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
|
||||
|
||||
}
|
||||
if ( rowOut == rowInOut1 )
|
||||
{
|
||||
io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
|
||||
io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
|
||||
io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
|
||||
}
|
||||
*/
|
||||
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
@@ -444,12 +432,23 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
|
||||
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
|
||||
}
|
||||
|
||||
/*
|
||||
casti_m256i( inout0, 0 ) = _mm512_castsi512_si256( io0.v512 );
|
||||
casti_m256i( inout0, 2 ) = _mm512_castsi512_si256( io1.v512 );
|
||||
casti_m256i( inout0, 4 ) = _mm512_castsi512_si256( io2.v512 );
|
||||
_mm512_mask_store_epi64( inout1, 0xf0, io0.v512 );
|
||||
_mm512_mask_store_epi64( inout1 +1, 0xf0, io1.v512 );
|
||||
_mm512_mask_store_epi64( inout1 +2, 0xf0, io2.v512 );
|
||||
*/
|
||||
|
||||
|
||||
casti_m256i( inout0, 0 ) = io0.v256lo;
|
||||
casti_m256i( inout1, 1 ) = io0.v256hi;
|
||||
casti_m256i( inout0, 2 ) = io1.v256lo;
|
||||
casti_m256i( inout1, 3 ) = io1.v256hi;
|
||||
casti_m256i( inout0, 4 ) = io2.v256lo;
|
||||
casti_m256i( inout1, 5 ) = io2.v256hi;
|
||||
|
||||
/*
|
||||
_mm512_mask_store_epi64( inout0, 0x0f, io.v512[0] );
|
||||
_mm512_mask_store_epi64( inout1, 0xf0, io.v512[0] );
|
||||
|
||||
Reference in New Issue
Block a user