mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v23.10
This commit is contained in:
@@ -429,7 +429,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
#define BLAKE256_4X32_BLOCK_BSWAP32 \
|
||||
{ \
|
||||
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ); \
|
||||
0x0405060700010203 ); \
|
||||
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
|
||||
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
|
||||
M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
|
||||
@@ -931,14 +931,14 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const v128_t shuf_bswap32 =
|
||||
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
|
||||
H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||
H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||
H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
||||
H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
||||
H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
||||
H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
||||
H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
||||
H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
||||
H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||
H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||
H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
||||
H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
||||
H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
||||
H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
||||
H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
||||
H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -131,47 +131,7 @@
|
||||
V[7] = v128_alignr64( V6, V7, 1 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#elif defined(__SSE2__)
|
||||
// always true
|
||||
|
||||
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
|
||||
{ \
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
|
||||
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_shuflr64_24( _mm_xor_si128( Vb, Vc ) ); \
|
||||
\
|
||||
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
|
||||
_mm_set_epi64x( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
|
||||
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
|
||||
Vc = _mm_add_epi64( Vc, Vd ); \
|
||||
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 63 ); \
|
||||
}
|
||||
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
v128_t *V = (v128_t*)v; \
|
||||
v128_t V2, V3, V6, V7; \
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
|
||||
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
|
||||
V2 = mm128_alignr_64( V[3], V[2], 1 ); \
|
||||
V3 = mm128_alignr_64( V[2], V[3], 1 ); \
|
||||
V6 = mm128_alignr_64( V[6], V[7], 1 ); \
|
||||
V7 = mm128_alignr_64( V[7], V[6], 1 ); \
|
||||
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
|
||||
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
|
||||
V[2] = mm128_alignr_64( V2, V3, 1 ); \
|
||||
V[3] = mm128_alignr_64( V3, V2, 1 ); \
|
||||
V[6] = mm128_alignr_64( V7, V6, 1 ); \
|
||||
V[7] = mm128_alignr_64( V6, V7, 1 ); \
|
||||
}
|
||||
*/
|
||||
|
||||
#else
|
||||
// never used, SSE2 is always available
|
||||
|
||||
#ifndef ROTR64
|
||||
// Rotate x right by y bits within a 64-bit word.
// Both shift counts are masked to 0..63: with the unmasked form a rotate
// count of 0 shifted left by 64, which is undefined behaviour in C for a
// 64-bit operand. Results for y in 1..63 are unchanged.
// NOTE(review): assumes x is an unsigned 64-bit value - confirm at call sites.
#define ROTR64(x, y) (((x) >> ((y) & 63)) | ((x) << ((64 - (y)) & 63)))
|
||||
|
||||
@@ -62,78 +62,78 @@ static const uint32_t IV256[] = {
|
||||
*/
|
||||
|
||||
// ss0(x): per 32-bit lane, (x >> 1) ^ (x << 3) ^ rol32(x, 4) ^ rol32(x, 19).
// x is expanded four times, so it should be a side-effect-free expression.
#define ss0(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
|
||||
_mm_slli_epi32( (x), 3) ), \
|
||||
_mm_xor_si128( mm128_rol_32( (x), 4), \
|
||||
mm128_rol_32( (x), 19) ) )
|
||||
v128_xor( v128_xor( v128_sr32( (x), 1), \
|
||||
v128_sl32( (x), 3) ), \
|
||||
v128_xor( v128_rol32( (x), 4), \
|
||||
v128_rol32( (x), 19) ) )
|
||||
|
||||
// ss1(x): per 32-bit lane, (x >> 1) ^ (x << 2) ^ rol32(x, 8) ^ rol32(x, 23).
// x is expanded four times, so it should be a side-effect-free expression.
#define ss1(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
|
||||
_mm_slli_epi32( (x), 2) ), \
|
||||
_mm_xor_si128( mm128_rol_32( (x), 8), \
|
||||
mm128_rol_32( (x), 23) ) )
|
||||
v128_xor( v128_xor( v128_sr32( (x), 1), \
|
||||
v128_sl32( (x), 2) ), \
|
||||
v128_xor( v128_rol32( (x), 8), \
|
||||
v128_rol32( (x), 23) ) )
|
||||
|
||||
// ss2(x): per 32-bit lane, (x >> 2) ^ (x << 1) ^ rol32(x, 12) ^ rol32(x, 25).
// x is expanded four times, so it should be a side-effect-free expression.
#define ss2(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
|
||||
_mm_slli_epi32( (x), 1) ), \
|
||||
_mm_xor_si128( mm128_rol_32( (x), 12), \
|
||||
mm128_rol_32( (x), 25) ) )
|
||||
v128_xor( v128_xor( v128_sr32( (x), 2), \
|
||||
v128_sl32( (x), 1) ), \
|
||||
v128_xor( v128_rol32( (x), 12), \
|
||||
v128_rol32( (x), 25) ) )
|
||||
|
||||
// ss3(x): per 32-bit lane, (x >> 2) ^ (x << 2) ^ rol32(x, 15) ^ rol32(x, 29).
// x is expanded four times, so it should be a side-effect-free expression.
#define ss3(x) \
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
|
||||
_mm_slli_epi32( (x), 2) ), \
|
||||
_mm_xor_si128( mm128_rol_32( (x), 15), \
|
||||
mm128_rol_32( (x), 29) ) )
|
||||
v128_xor( v128_xor( v128_sr32( (x), 2), \
|
||||
v128_sl32( (x), 2) ), \
|
||||
v128_xor( v128_rol32( (x), 15), \
|
||||
v128_rol32( (x), 29) ) )
|
||||
|
||||
// ss4(x): per 32-bit lane, x ^ (x >> 1). x is expanded twice.
#define ss4(x) \
|
||||
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
|
||||
v128_xor( (x), v128_sr32( (x), 1 ) )
|
||||
|
||||
// ss5(x): per 32-bit lane, x ^ (x >> 2). x is expanded twice.
#define ss5(x) \
|
||||
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
|
||||
v128_xor( (x), v128_sr32( (x), 2 ) )
|
||||
|
||||
// rs1..rs7: rotate every 32-bit lane of x left by a fixed count
// (3, 7, 13, 16, 19, 23 and 27 bits respectively).
#define rs1(x) mm128_rol_32( x, 3 )
|
||||
#define rs2(x) mm128_rol_32( x, 7 )
|
||||
#define rs3(x) mm128_rol_32( x, 13 )
|
||||
#define rs4(x) mm128_rol_32( x, 16 )
|
||||
#define rs5(x) mm128_rol_32( x, 19 )
|
||||
#define rs6(x) mm128_rol_32( x, 23 )
|
||||
#define rs7(x) mm128_rol_32( x, 27 )
|
||||
#define rs1(x) v128_rol32( x, 3 )
|
||||
#define rs2(x) v128_rol32( x, 7 )
|
||||
#define rs3(x) v128_rol32( x, 13 )
|
||||
#define rs4(x) v128_rol32( x, 16 )
|
||||
#define rs5(x) v128_rol32( x, 19 )
|
||||
#define rs6(x) v128_rol32( x, 23 )
|
||||
#define rs7(x) v128_rol32( x, 27 )
|
||||
|
||||
// rol_off_32( M, j, off ): with k = (j + off) & 0xF, rotate every 32-bit lane
// of M[k] left by k + 1 bits (so the rotate count is in 1..16).
#define rol_off_32( M, j, off ) \
|
||||
mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
v128_rol32( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
|
||||
// add_elt_s( M, H, j ): per 32-bit lane,
//   ( rol_off_32(M,j,0) + rol_off_32(M,j,3) - rol_off_32(M,j,10)
//     + (j+16) * 0x05555555 ) ^ H[ (j+7) & 0xF ]
// The constant is broadcast to every lane; sums wrap modulo 2^32.
#define add_elt_s( M, H, j ) \
|
||||
_mm_xor_si128( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
|
||||
v128_xor( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_add32( rol_off_32( M, j, 0 ), \
|
||||
rol_off_32( M, j, 3 ) ), \
|
||||
rol_off_32( M, j, 10 ) ), \
|
||||
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
|
||||
v128_32( ( (j)+16 ) * 0x05555555UL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
|
||||
// expand1s( qt, M, H, i ): value for qt[i] built from the 16 preceding entries.
// Per 32-bit lane it sums ss1, ss2, ss3, ss0 applied in that repeating order
// to qt[i-16] .. qt[i-1], then adds add_elt_s( M, H, i-16 ).
#define expand1s( qt, M, H, i ) \
|
||||
_mm_add_epi32( mm128_add4_32( \
|
||||
mm128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
|
||||
v128_add32( v128_add4_32( \
|
||||
v128_add4_32( ss1( qt[ (i)-16 ] ), ss2( qt[ (i)-15 ] ), \
|
||||
ss3( qt[ (i)-14 ] ), ss0( qt[ (i)-13 ] ) ), \
|
||||
mm128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
|
||||
v128_add4_32( ss1( qt[ (i)-12 ] ), ss2( qt[ (i)-11 ] ), \
|
||||
ss3( qt[ (i)-10 ] ), ss0( qt[ (i)- 9 ] ) ), \
|
||||
mm128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
|
||||
v128_add4_32( ss1( qt[ (i)- 8 ] ), ss2( qt[ (i)- 7 ] ), \
|
||||
ss3( qt[ (i)- 6 ] ), ss0( qt[ (i)- 5 ] ) ), \
|
||||
mm128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
|
||||
v128_add4_32( ss1( qt[ (i)- 4 ] ), ss2( qt[ (i)- 3 ] ), \
|
||||
ss3( qt[ (i)- 2 ] ), ss0( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_s( M, H, (i)-16 ) )
|
||||
|
||||
// expand2s( qt, M, H, i ): value for qt[i] built from the 16 preceding entries.
// Per 32-bit lane it sums
//   qt[i-16] + rs1(qt[i-15]) + qt[i-14] + rs2(qt[i-13]) + qt[i-12] + rs3(qt[i-11])
//   + qt[i-10] + rs4(qt[i-9]) + qt[i-8] + rs5(qt[i-7]) + qt[i-6] + rs6(qt[i-5])
//   + qt[i-4] + rs7(qt[i-3]) + ss4(qt[i-2]) + ss5(qt[i-1])
// and then adds add_elt_s( M, H, i-16 ).
#define expand2s( qt, M, H, i) \
|
||||
_mm_add_epi32( mm128_add4_32( \
|
||||
mm128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
|
||||
v128_add32( v128_add4_32( \
|
||||
v128_add4_32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ), \
|
||||
mm128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
|
||||
v128_add4_32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ), \
|
||||
qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ), \
|
||||
mm128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
|
||||
v128_add4_32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ), \
|
||||
mm128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
|
||||
v128_add4_32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ), \
|
||||
ss4( qt[ (i)- 2 ] ), ss5( qt[ (i)- 1 ] ) ) ), \
|
||||
add_elt_s( M, H, (i)-16 ) )
|
||||
|
||||
@@ -141,169 +141,169 @@ static const uint32_t IV256[] = {
|
||||
// resulting in some sign changes compared to the reference code.
|
||||
|
||||
// Ws0: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[5] - X[7] + X[10] + X[13] + X[14]
// Uses the identifiers M and H from the expansion site.
#define Ws0 \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[13], H[13] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
v128_add32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
|
||||
v128_xor( M[ 7], H[ 7] ) ), \
|
||||
v128_xor( M[10], H[10] ) ), \
|
||||
v128_add32( v128_xor( M[13], H[13] ), \
|
||||
v128_xor( M[14], H[14] ) ) )
|
||||
|
||||
// Ws1: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[6] - X[8] + X[11] + X[14] - X[15]
#define Ws1 \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[14], H[14] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_add32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 6], H[ 6] ), \
|
||||
v128_xor( M[ 8], H[ 8] ) ), \
|
||||
v128_xor( M[11], H[11] ) ), \
|
||||
v128_sub32( v128_xor( M[14], H[14] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
|
||||
// Ws2: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[0] + X[7] + X[9] - X[12] + X[15]
#define Ws2 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_add32( v128_xor( M[ 0], H[ 0] ), \
|
||||
v128_xor( M[ 7], H[ 7] ) ), \
|
||||
v128_xor( M[ 9], H[ 9] ) ), \
|
||||
v128_sub32( v128_xor( M[12], H[12] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
|
||||
// Ws3: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[0] - X[1] + X[8] - X[10] + X[13]
#define Ws3 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[10], H[10] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
|
||||
v128_xor( M[ 1], H[ 1] ) ), \
|
||||
v128_xor( M[ 8], H[ 8] ) ), \
|
||||
v128_sub32( v128_xor( M[10], H[10] ), \
|
||||
v128_xor( M[13], H[13] ) ) )
|
||||
|
||||
// Ws4: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[1] + X[2] + X[9] - X[11] - X[14]
#define Ws4 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_add32( v128_xor( M[ 1], H[ 1] ), \
|
||||
v128_xor( M[ 2], H[ 2] ) ), \
|
||||
v128_xor( M[ 9], H[ 9] ) ), \
|
||||
v128_add32( v128_xor( M[11], H[11] ), \
|
||||
v128_xor( M[14], H[14] ) ) )
|
||||
|
||||
// Ws5: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[3] - X[2] + X[10] - X[12] + X[15]
#define Ws5 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
|
||||
v128_xor( M[ 2], H[ 2] ) ), \
|
||||
v128_xor( M[10], H[10] ) ), \
|
||||
v128_sub32( v128_xor( M[12], H[12] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
|
||||
// Ws6: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[4] - X[0] - X[3] - X[11] + X[13]
#define Ws6 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 4], H[ 4] ), \
|
||||
v128_xor( M[ 0], H[ 0] ) ), \
|
||||
v128_xor( M[ 3], H[ 3] ) ), \
|
||||
v128_sub32( v128_xor( M[11], H[11] ), \
|
||||
v128_xor( M[13], H[13] ) ) )
|
||||
|
||||
// Ws7: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[1] - X[4] - X[5] - X[12] - X[14]
#define Ws7 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 1], H[ 1] ), \
|
||||
v128_xor( M[ 4], H[ 4] ) ), \
|
||||
v128_xor( M[ 5], H[ 5] ) ), \
|
||||
v128_add32( v128_xor( M[12], H[12] ), \
|
||||
v128_xor( M[14], H[14] ) ) )
|
||||
|
||||
// Ws8: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[2] - X[5] - X[6] + X[13] - X[15]
#define Ws8 \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[13], H[13] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_add32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 2], H[ 2] ), \
|
||||
v128_xor( M[ 5], H[ 5] ) ), \
|
||||
v128_xor( M[ 6], H[ 6] ) ), \
|
||||
v128_sub32( v128_xor( M[13], H[13] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
// Ws9: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[0] - X[3] + X[6] - X[7] + X[14]
#define Ws9 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 0], H[ 0] ), \
|
||||
v128_xor( M[ 3], H[ 3] ) ), \
|
||||
v128_xor( M[ 6], H[ 6] ) ), \
|
||||
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
|
||||
v128_xor( M[14], H[14] ) ) )
|
||||
|
||||
// Ws10: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[8] - X[1] - X[4] - X[7] + X[15]
#define Ws10 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 7], H[ 7] ), \
|
||||
_mm_xor_si128( M[15], H[15] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
|
||||
v128_xor( M[ 1], H[ 1] ) ), \
|
||||
v128_xor( M[ 4], H[ 4] ) ), \
|
||||
v128_sub32( v128_xor( M[ 7], H[ 7] ), \
|
||||
v128_xor( M[15], H[15] ) ) )
|
||||
|
||||
// Ws11: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[8] - X[0] - X[2] - X[5] + X[9]
#define Ws11 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[ 8], H[ 8] ), \
|
||||
v128_xor( M[ 0], H[ 0] ) ), \
|
||||
v128_xor( M[ 2], H[ 2] ) ), \
|
||||
v128_sub32( v128_xor( M[ 5], H[ 5] ), \
|
||||
v128_xor( M[ 9], H[ 9] ) ) )
|
||||
|
||||
// Ws12: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[1] + X[3] - X[6] - X[9] + X[10]
#define Ws12 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_add32( v128_xor( M[ 1], H[ 1] ), \
|
||||
v128_xor( M[ 3], H[ 3] ) ), \
|
||||
v128_xor( M[ 6], H[ 6] ) ), \
|
||||
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
|
||||
v128_xor( M[10], H[10] ) ) )
|
||||
|
||||
// Ws13: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[2] + X[4] + X[7] + X[10] + X[11]
#define Ws13 \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[10], H[10] ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ) )
|
||||
v128_add32( \
|
||||
v128_add32( \
|
||||
v128_add32( v128_xor( M[ 2], H[ 2] ), \
|
||||
v128_xor( M[ 4], H[ 4] ) ), \
|
||||
v128_xor( M[ 7], H[ 7] ) ), \
|
||||
v128_add32( v128_xor( M[10], H[10] ), \
|
||||
v128_xor( M[11], H[11] ) ) )
|
||||
|
||||
// Ws14: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[3] - X[5] + X[8] - X[11] - X[12]
#define Ws14 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_add_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_add_epi32( _mm_xor_si128( M[11], H[11] ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ) )
|
||||
v128_sub32( \
|
||||
v128_add32( \
|
||||
v128_sub32( v128_xor( M[ 3], H[ 3] ), \
|
||||
v128_xor( M[ 5], H[ 5] ) ), \
|
||||
v128_xor( M[ 8], H[ 8] ) ), \
|
||||
v128_add32( v128_xor( M[11], H[11] ), \
|
||||
v128_xor( M[12], H[12] ) ) )
|
||||
|
||||
// Ws15: with X[k] = M[k] ^ H[k], per 32-bit lane (wrapping):
//   X[12] - X[4] - X[6] - X[9] + X[13]
#define Ws15 \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[ 4], H[4] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_sub_epi32( _mm_xor_si128( M[ 9], H[ 9] ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ) )
|
||||
v128_sub32( \
|
||||
v128_sub32( \
|
||||
v128_sub32( v128_xor( M[12], H[12] ), \
|
||||
v128_xor( M[ 4], H[4] ) ), \
|
||||
v128_xor( M[ 6], H[ 6] ) ), \
|
||||
v128_sub32( v128_xor( M[ 9], H[ 9] ), \
|
||||
v128_xor( M[13], H[13] ) ) )
|
||||
|
||||
|
||||
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
|
||||
void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
|
||||
{
|
||||
__m128i qt[32], xl, xh; \
|
||||
v128u64_t qt[32], xl, xh; \
|
||||
|
||||
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
|
||||
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
|
||||
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
|
||||
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
|
||||
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
|
||||
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
|
||||
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
|
||||
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
|
||||
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
|
||||
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
|
||||
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
|
||||
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
|
||||
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
|
||||
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
|
||||
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
|
||||
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
|
||||
qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] );
|
||||
qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] );
|
||||
qt[ 2] = v128_add32( ss2( Ws2 ), H[ 3] );
|
||||
qt[ 3] = v128_add32( ss3( Ws3 ), H[ 4] );
|
||||
qt[ 4] = v128_add32( ss4( Ws4 ), H[ 5] );
|
||||
qt[ 5] = v128_add32( ss0( Ws5 ), H[ 6] );
|
||||
qt[ 6] = v128_add32( ss1( Ws6 ), H[ 7] );
|
||||
qt[ 7] = v128_add32( ss2( Ws7 ), H[ 8] );
|
||||
qt[ 8] = v128_add32( ss3( Ws8 ), H[ 9] );
|
||||
qt[ 9] = v128_add32( ss4( Ws9 ), H[10] );
|
||||
qt[10] = v128_add32( ss0( Ws10), H[11] );
|
||||
qt[11] = v128_add32( ss1( Ws11), H[12] );
|
||||
qt[12] = v128_add32( ss2( Ws12), H[13] );
|
||||
qt[13] = v128_add32( ss3( Ws13), H[14] );
|
||||
qt[14] = v128_add32( ss4( Ws14), H[15] );
|
||||
qt[15] = v128_add32( ss0( Ws15), H[ 0] );
|
||||
qt[16] = expand1s( qt, M, H, 16 );
|
||||
qt[17] = expand1s( qt, M, H, 17 );
|
||||
qt[18] = expand2s( qt, M, H, 18 );
|
||||
@@ -321,92 +321,92 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
|
||||
qt[30] = expand2s( qt, M, H, 30 );
|
||||
qt[31] = expand2s( qt, M, H, 31 );
|
||||
|
||||
xl = _mm_xor_si128( mm128_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
mm128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = _mm_xor_si128( xl, _mm_xor_si128(
|
||||
mm128_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
mm128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
xl = v128_xor( v128_xor4( qt[16], qt[17], qt[18], qt[19] ),
|
||||
v128_xor4( qt[20], qt[21], qt[22], qt[23] ) );
|
||||
xh = v128_xor( xl, v128_xor(
|
||||
v128_xor4( qt[24], qt[25], qt[26], qt[27] ),
|
||||
v128_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
|
||||
|
||||
dH[ 0] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[0],
|
||||
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
|
||||
_mm_srli_epi32( qt[16], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[1],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
|
||||
_mm_slli_epi32( qt[17], 8 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[2],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
|
||||
_mm_slli_epi32( qt[18], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[3],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
|
||||
_mm_slli_epi32( qt[19], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[4],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
|
||||
_mm_slli_epi32( qt[20], 0 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[5],
|
||||
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
|
||||
_mm_srli_epi32( qt[21], 6 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[6],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
|
||||
_mm_slli_epi32( qt[22], 6 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = _mm_add_epi32(
|
||||
_mm_xor_si128( M[7],
|
||||
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
|
||||
_mm_slli_epi32( qt[23], 2 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[4], 9 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
|
||||
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
|
||||
_mm_xor_si128( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[5], 10 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
|
||||
_mm_xor_si128( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[6], 11 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
|
||||
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
|
||||
_mm_xor_si128( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[7], 12 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
|
||||
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
|
||||
_mm_xor_si128( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[0], 13 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
|
||||
_mm_xor_si128( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[1], 14 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
|
||||
_mm_xor_si128( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[2], 15 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
|
||||
_mm_xor_si128( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm_add_epi32( _mm_add_epi32(
|
||||
mm128_rol_32( dH[3], 16 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
|
||||
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
|
||||
_mm_xor_si128( qt[22], qt[15] ) ) );
|
||||
dH[ 0] = v128_add32(
|
||||
v128_xor( M[0],
|
||||
v128_xor( v128_sl32( xh, 5 ),
|
||||
v128_sr32( qt[16], 5 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[24] ), qt[ 0] ));
|
||||
dH[ 1] = v128_add32(
|
||||
v128_xor( M[1],
|
||||
v128_xor( v128_sr32( xh, 7 ),
|
||||
v128_sl32( qt[17], 8 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[25] ), qt[ 1] ));
|
||||
dH[ 2] = v128_add32(
|
||||
v128_xor( M[2],
|
||||
v128_xor( v128_sr32( xh, 5 ),
|
||||
v128_sl32( qt[18], 5 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[26] ), qt[ 2] ));
|
||||
dH[ 3] = v128_add32(
|
||||
v128_xor( M[3],
|
||||
v128_xor( v128_sr32( xh, 1 ),
|
||||
v128_sl32( qt[19], 5 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[27] ), qt[ 3] ));
|
||||
dH[ 4] = v128_add32(
|
||||
v128_xor( M[4],
|
||||
v128_xor( v128_sr32( xh, 3 ),
|
||||
v128_sl32( qt[20], 0 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[28] ), qt[ 4] ));
|
||||
dH[ 5] = v128_add32(
|
||||
v128_xor( M[5],
|
||||
v128_xor( v128_sl32( xh, 6 ),
|
||||
v128_sr32( qt[21], 6 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[29] ), qt[ 5] ));
|
||||
dH[ 6] = v128_add32(
|
||||
v128_xor( M[6],
|
||||
v128_xor( v128_sr32( xh, 4 ),
|
||||
v128_sl32( qt[22], 6 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[30] ), qt[ 6] ));
|
||||
dH[ 7] = v128_add32(
|
||||
v128_xor( M[7],
|
||||
v128_xor( v128_sr32( xh, 11 ),
|
||||
v128_sl32( qt[23], 2 ) ) ),
|
||||
v128_xor( v128_xor( xl, qt[31] ), qt[ 7] ));
|
||||
dH[ 8] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[4], 9 ),
|
||||
v128_xor( v128_xor( xh, qt[24] ), M[ 8] )),
|
||||
v128_xor( v128_sl32( xl, 8 ),
|
||||
v128_xor( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[5], 10 ),
|
||||
v128_xor( v128_xor( xh, qt[25] ), M[ 9] )),
|
||||
v128_xor( v128_sr32( xl, 6 ),
|
||||
v128_xor( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[6], 11 ),
|
||||
v128_xor( v128_xor( xh, qt[26] ), M[10] )),
|
||||
v128_xor( v128_sl32( xl, 6 ),
|
||||
v128_xor( qt[17], qt[10] ) ) );
|
||||
dH[11] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[7], 12 ),
|
||||
v128_xor( v128_xor( xh, qt[27] ), M[11] )),
|
||||
v128_xor( v128_sl32( xl, 4 ),
|
||||
v128_xor( qt[18], qt[11] ) ) );
|
||||
dH[12] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[0], 13 ),
|
||||
v128_xor( v128_xor( xh, qt[28] ), M[12] )),
|
||||
v128_xor( v128_sr32( xl, 3 ),
|
||||
v128_xor( qt[19], qt[12] ) ) );
|
||||
dH[13] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[1], 14 ),
|
||||
v128_xor( v128_xor( xh, qt[29] ), M[13] )),
|
||||
v128_xor( v128_sr32( xl, 4 ),
|
||||
v128_xor( qt[20], qt[13] ) ) );
|
||||
dH[14] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[2], 15 ),
|
||||
v128_xor( v128_xor( xh, qt[30] ), M[14] )),
|
||||
v128_xor( v128_sr32( xl, 7 ),
|
||||
v128_xor( qt[21], qt[14] ) ) );
|
||||
dH[15] = v128_add32( v128_add32(
|
||||
v128_rol32( dH[3], 16 ),
|
||||
v128_xor( v128_xor( xh, qt[31] ), M[15] )),
|
||||
v128_xor( v128_sr32( xl, 2 ),
|
||||
v128_xor( qt[22], qt[15] ) ) );
|
||||
}
|
||||
|
||||
static const uint32_t final_s[16][4] =
|
||||
@@ -429,7 +429,7 @@ static const uint32_t final_s[16][4] =
|
||||
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
|
||||
};
|
||||
/*
|
||||
static const __m128i final_s[16] =
|
||||
static const v128u64_t final_s[16] =
|
||||
{
|
||||
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
|
||||
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
|
||||
@@ -451,26 +451,26 @@ static const __m128i final_s[16] =
|
||||
*/
|
||||
// Reset a 4-lane BMW-256 context.
// Loads the 16 chaining words into ctx->H and clears the buffer position and
// the bit counter. Each 64-bit constant repeats one 32-bit word twice
// (0x40414243, 0x44454647, ... 0x7C7D7E7F), so after the broadcast every
// 32-bit lane of H[i] holds the same initial value.
void bmw256_4way_init( bmw256_4way_context *ctx )
|
||||
{
|
||||
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
|
||||
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
|
||||
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
|
||||
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
|
||||
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
|
||||
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
|
||||
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
|
||||
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
|
||||
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
|
||||
ctx->H[ 0] = v128_64( 0x4041424340414243 );
|
||||
ctx->H[ 1] = v128_64( 0x4445464744454647 );
|
||||
ctx->H[ 2] = v128_64( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = v128_64( 0x5051525350515253 );
|
||||
ctx->H[ 5] = v128_64( 0x5455565754555657 );
|
||||
ctx->H[ 6] = v128_64( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = v128_64( 0x6061626360616263 );
|
||||
ctx->H[ 9] = v128_64( 0x6465666764656667 );
|
||||
ctx->H[10] = v128_64( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = v128_64( 0x7071727370717273 );
|
||||
ctx->H[13] = v128_64( 0x7475767774757677 );
|
||||
ctx->H[14] = v128_64( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F );
|
||||
|
|
||||
|
|
||||
// Disabled alternative: broadcast each 32-bit IV word from a table instead
// of using the hard-coded 64-bit constants above.
// for ( int i = 0; i < 16; i++ )
|
||||
// sc->H[i] = _mm_set1_epi32( iv[i] );
|
||||
// sc->H[i] = v128_32( iv[i] );
|
||||
// No bytes buffered and no bits hashed yet.
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
}
|
||||
@@ -478,10 +478,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx )
|
||||
static void
|
||||
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
{
|
||||
__m128i *vdata = (__m128i*)data;
|
||||
__m128i *buf;
|
||||
__m128i htmp[16];
|
||||
__m128i *h1, *h2;
|
||||
v128u64_t *vdata = (v128u64_t*)data;
|
||||
v128u64_t *buf;
|
||||
v128u64_t htmp[16];
|
||||
v128u64_t *h1, *h2;
|
||||
size_t ptr;
|
||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||
|
||||
@@ -497,13 +497,13 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
|
||||
v128_memcpy( buf + (ptr>>2), vdata, clen >> 2 );
|
||||
vdata += ( clen >> 2 );
|
||||
len -= clen;
|
||||
ptr += clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
__m128i *ht;
|
||||
v128u64_t *ht;
|
||||
compress_small( buf, h1, h2 );
|
||||
ht = h1;
|
||||
h1 = h2;
|
||||
@@ -513,46 +513,45 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
}
|
||||
sc->ptr = ptr;
|
||||
|
||||
|
||||
if ( h1 != sc->H )
|
||||
memcpy_128( sc->H, h1, 16 );
|
||||
v128_memcpy( sc->H, h1, 16 );
|
||||
}
|
||||
|
||||
// Finish a 4-lane BMW (32-bit word) hash and write the digest.
//   sc           context; sc->buf, sc->ptr, sc->H and sc->bit_count are read,
//                and sc->buf is overwritten as scratch space.
//   ub           not referenced in this function.
//   n            added to sc->bit_count when the length word is written.
//   dst          receives out_size_w32 vectors, one per output word.
//   out_size_w32 number of 32-bit words of digest per lane; the last
//                out_size_w32 of the 16 final state vectors are copied out.
// ptr counts bytes of one lane; buf holds one vector per 32-bit word, hence
// the ptr>>2 indexing throughout.
static void
|
||||
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32)
|
||||
{
|
||||
__m128i *buf;
|
||||
__m128i h1[16], h2[16], *h;
|
||||
v128u64_t *buf;
|
||||
v128u64_t h1[16], h2[16], *h;
|
||||
size_t ptr, u, v;
|
||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||
|
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
// Padding marker: 0x00000080 in every 32-bit lane of the next word.
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
|
||||
buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
h = sc->H;
|
||||
|
|
||||
// assume bit_count fits in 32 bits
|
||||
// If the padding word filled the block, zero the (empty) remainder,
// compress it and continue with a fresh block chained from h1.
// NOTE(review): the length below occupies the last 8 bytes of the block, but
// this test only fires when ptr == 64. With sc->ptr == 56 on entry (ptr == 60
// here) the length word written to buf[14] appears to overwrite the padding
// word, and (buf_size - 8 - ptr) wraps as size_t - confirm callers never
// finish with 56 buffered bytes, or compare against buf_size - 8.
if ( ptr > buf_size - 4 )
|
||||
{
|
||||
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
v128_memset_zero( buf + (ptr>>2), (buf_size - ptr) >> 2 );
|
||||
compress_small( buf, h, h1 );
|
||||
ptr = 0;
|
||||
h = h1;
|
||||
}
|
||||
// Zero up to the length field, then store the bit length: low 32 bits
// broadcast to every lane, high 32 bits zero.
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
|
||||
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
|
||||
buf[ (buf_size - 4) >> 2 ] = m128_zero;
|
||||
v128_memset_zero( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
|
||||
buf[ (buf_size - 8) >> 2 ] = v128_32( sc->bit_count + n );
|
||||
buf[ (buf_size - 4) >> 2 ] = v128_zero;
|
||||
compress_small( buf, h, h2 );
|
||||
|
|
||||
// Final pass: the state h2 becomes the message block and the constant
// table final_s is used as the chaining value.
for ( u = 0; u < 16; u ++ )
|
||||
buf[u] = h2[u];
|
||||
|
|
||||
compress_small( buf, (__m128i*)final_s, h1 );
|
||||
compress_small( buf, (v128u64_t*)final_s, h1 );
|
||||
|
|
||||
// Emit the trailing out_size_w32 vectors of the final state.
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
|
||||
casti_m128i( dst, u ) = h1[v];
|
||||
casti_v128( dst, u ) = h1[v];
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -146,7 +146,7 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
|
||||
#define SUBSTITUTE(r0, _t2 )\
|
||||
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
|
||||
_t2 = _mm_aesenclast_si128( _t2, m128_zero )
|
||||
_t2 = _mm_aesenclast_si128( _t2, v128_zero )
|
||||
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
t2 = t0;\
|
||||
@@ -162,16 +162,16 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = v128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = mm128_xor3( t4, t1, t2 ); \
|
||||
t4 = v128_xor3( t4, t1, t2 ); \
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
|
||||
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
|
||||
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
|
||||
t4 = mm128_xor3( t4, t2, t1 ); \
|
||||
t4 = v128_xor3( t4, t2, t1 ); \
|
||||
t0 = _mm_xor_si128(t0, t3);\
|
||||
t4 = mm128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
|
||||
t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
|
||||
|
||||
/*
|
||||
#define SUPERMIX(t0, t1, t2, t3, t4)\
|
||||
@@ -188,7 +188,7 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
|
||||
t4 = _mm_xor_si128(t4, t1);\
|
||||
t2 = mm128_xor3(t2, t3, t0 );\
|
||||
t2 = v128_xor3(t2, t3, t0 );\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
|
||||
t4 = _mm_xor_si128(t4, t2);\
|
||||
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
|
||||
@@ -485,7 +485,7 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
|
||||
ctx->uBlockLength = 4;
|
||||
|
||||
for(i = 0; i < 6; i++)
|
||||
ctx->state[i] = m128_zero;
|
||||
ctx->state[i] = v128_zero;
|
||||
|
||||
ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
|
||||
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
|
||||
|
||||
@@ -66,7 +66,40 @@ static const v128u32_t BLEND_MASK = { 0xffffffff, 0, 0, 0xffffffff };
|
||||
|
||||
#define gr_shuffle32( v ) v128_blendv( v128_qrev32( v ), v, BLEND_MASK )
|
||||
|
||||
//#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
|
||||
/*
|
||||
#define TRANSP_MASK \
|
||||
0xd,0x5,0x9,0x1,0xc,0x4,0x8,0x0,0xf,0x7,0xb,0x3,0xe,0x6,0xa,0x2
|
||||
#define SUBSH_MASK0 \
|
||||
0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8
|
||||
#define SUBSH_MASK1 \
|
||||
0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9
|
||||
#define SUBSH_MASK2 \
|
||||
0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa
|
||||
#define SUBSH_MASK3 \
|
||||
0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb
|
||||
#define SUBSH_MASK4 \
|
||||
0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc
|
||||
#define SUBSH_MASK5 \
|
||||
0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd
|
||||
#define SUBSH_MASK6 \
|
||||
0x1,0x4,0x7,0xa,0xd,0x0,0x3,0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe
|
||||
#define SUBSH_MASK7 \
|
||||
0x6,0x9,0xc,0xf,0x2,0x5,0x8,0xb,0xe,0x1,0x4,0x7,0xa,0xd,0x0,0x3
|
||||
|
||||
//#define gr_shuffle8( v, c ) v128_shullfev8( v, c )
|
||||
|
||||
|
||||
#define gr_shuffle8( v, c15, c14, c13, c12, c11, c10, c09, c08, \
|
||||
c07, c06, c05, c04, c03, c02, c01, c00 ) \
|
||||
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
|
||||
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
|
||||
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
|
||||
v128_movlane8( v128_movlane8( v128_movlane8( v128_movlane8( \
|
||||
v, 15, v, c15 ), 14, v, c14 ), 13, v, c13 ), 12, v, c12 ), \
|
||||
11, v, c11 ), 10, v, c10 ), 9, v, c09 ), 8, v, c08 ), \
|
||||
7, v, c07 ), 6, v, c06 ), 5, v, c05 ), 4, v, c04 ), \
|
||||
3, v, c03 ), 2, v, c02 ), 1, v, c01 ), 0, v, c00 )
|
||||
*/
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -626,7 +626,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
|
||||
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
|
||||
b1 = mm256_bcast_m128( mm128_mask_32( v128_neg1, 0x3 ) ); \
|
||||
a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
|
||||
a1 = _mm256_xor_si256( a1, b1 );\
|
||||
a2 = _mm256_xor_si256( a2, b1 );\
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
// SSE2 or NEON Hamsi-512 2x64
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@@ -57,6 +57,8 @@ void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_2x64( void *dst, const void *data, size_t len );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// Hamsi-512 4x64
|
||||
|
||||
@@ -75,16 +75,16 @@
|
||||
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
v128_t t = a0; \
|
||||
a0 = mm128_xoror( a3, a0, a1 ); \
|
||||
a0 = v128_xoror( a3, a0, a1 ); \
|
||||
a2 = v128_xor( a2, a3 ); \
|
||||
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a3 = mm128_xorand( a2, a3, t ); \
|
||||
a2 = mm128_xorand( a1, a2, a0 ); \
|
||||
a3 = v128_xorand( a2, a3, t ); \
|
||||
a2 = v128_xorand( a1, a2, a0 ); \
|
||||
a1 = v128_or( a1, a3 ); \
|
||||
a3 = v128_xor( a3, a2 ); \
|
||||
t = v128_xor( t, a1 ); \
|
||||
a2 = v128_and( a2, a1 ); \
|
||||
a1 = mm128_xnor( a1, a0 ); \
|
||||
a1 = v128_xnor( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
|
||||
@@ -35,13 +35,13 @@ static const uint32_t IV[5] =
|
||||
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
|
||||
|
||||
#define F3(x, y, z) \
|
||||
_mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )
|
||||
_mm_xor_si128( _mm_or_si128( x, v128_not( y ) ), z )
|
||||
|
||||
#define F4(x, y, z) \
|
||||
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
|
||||
|
||||
#define F5(x, y, z) \
|
||||
_mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )
|
||||
_mm_xor_si128( x, _mm_or_si128( y, v128_not( z ) ) )
|
||||
|
||||
#define RR(a, b, c, d, e, f, s, r, k) \
|
||||
do{ \
|
||||
|
||||
@@ -319,7 +319,7 @@ int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
|
||||
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
|
||||
v128_t vmask, targ, hash;
|
||||
int t6_mask, flip;
|
||||
v128_t W[16]; memcpy_128( W, data, 16 );
|
||||
v128_t W[16]; v128_memcpy( W, data, 16 );
|
||||
|
||||
A = v128_load( state_in );
|
||||
B = v128_load( state_in+1 );
|
||||
|
||||
@@ -5,11 +5,11 @@
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA512256D_8WAY 1
|
||||
#define SHA512256D_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA512256D_4WAY 1
|
||||
#define SHA512256D_4WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define SHA512256D_2WAY 1
|
||||
#define SHA512256D_2WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(SHA512256D_8WAY)
|
||||
@@ -110,14 +110,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i four = v256_64( 0x0000000400000000 );
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
casti_m256i( vdata,9 ) = mm256_intrlv_blend_32( _mm256_set_epi32(
|
||||
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
|
||||
do
|
||||
{
|
||||
sha512256d_4way_init( &ctx );
|
||||
@@ -138,7 +137,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, four );
|
||||
casti_m256i( vdata,9 ) = _mm256_add_epi32( casti_m256i( vdata,9 ), four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
@@ -180,11 +179,10 @@ int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce,
|
||||
v128u64_t *noncev = (v128u64_t*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const v128u64_t two = v128_64( 0x0000000200000000 );
|
||||
const v128_t two = v128_64( 0x0000000200000000 );
|
||||
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
*noncev = v128_add32( v128_set32( 1, 0, 0, 0 ), *noncev );
|
||||
// *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
|
||||
do
|
||||
{
|
||||
@@ -279,7 +277,7 @@ int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
|
||||
|
||||
bool register_sha512256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
#if defined(SHA512256D_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha512256d_8way;
|
||||
#elif defined(SHA512256D_4WAY)
|
||||
|
||||
@@ -71,7 +71,7 @@ static const uint32_t IV512[] =
|
||||
static void
|
||||
c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
{
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const v128_t zero = v128_zero;
|
||||
__m256i p0, p1, p2, p3, x;
|
||||
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
|
||||
__m256i *m = (__m256i*)msg;
|
||||
@@ -278,7 +278,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
|
||||
void shavite512_2way_init( shavite512_2way_context *ctx )
|
||||
{
|
||||
__m256i *h = (__m256i*)ctx->h;
|
||||
__m128i *iv = (__m128i*)IV512;
|
||||
v128_t *iv = (v128_t*)IV512;
|
||||
|
||||
h[0] = mm256_bcast_m128( iv[0] );
|
||||
h[1] = mm256_bcast_m128( iv[1] );
|
||||
@@ -358,7 +358,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
|
||||
count.u32[3] = ctx->count3;
|
||||
|
||||
casti_m256i( buf, 6 ) = mm256_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
@@ -434,7 +434,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
|
||||
}
|
||||
|
||||
casti_m256i( buf, 6 ) = mm256_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
@@ -451,7 +451,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
|
||||
const void *data, size_t len )
|
||||
{
|
||||
__m256i *h = (__m256i*)ctx->h;
|
||||
__m128i *iv = (__m128i*)IV512;
|
||||
v128_t *iv = (v128_t*)IV512;
|
||||
|
||||
h[0] = mm256_bcast_m128( iv[0] );
|
||||
h[1] = mm256_bcast_m128( iv[1] );
|
||||
@@ -524,7 +524,7 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
|
||||
}
|
||||
|
||||
casti_m256i( buf, 6 ) = mm256_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
|
||||
@@ -303,7 +303,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
|
||||
count.u32[3] = ctx->count3;
|
||||
|
||||
casti_m512i( buf, 6 ) = mm512_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
@@ -379,7 +379,7 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
|
||||
}
|
||||
|
||||
casti_m512i( buf, 6 ) = mm512_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
@@ -470,7 +470,7 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
|
||||
}
|
||||
|
||||
casti_m512i( buf, 6 ) = mm512_bcast_m128(
|
||||
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
|
||||
_mm_insert_epi16( v128_zero, count.u16[0], 7 ) );
|
||||
casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
|
||||
0x0200, count.u16[7], count.u16[6], count.u16[5],
|
||||
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
|
||||
|
||||
@@ -159,4 +159,69 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SKEIN_2WAY)
|
||||
|
||||
static __thread skein512_2x64_context skein512_2x64_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
void skeinhash_2x64( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhash64[8*2] __attribute__ ((aligned (32)));
|
||||
uint32_t hash0[16] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[16] __attribute__ ((aligned (32)));
|
||||
skein512_2x64_context ctx_skein;
|
||||
memcpy( &ctx_skein, &skein512_2x64_ctx, sizeof( ctx_skein ) );
|
||||
|
||||
skein512_2x64_final16( &ctx_skein, vhash64, input + (64*2) );
|
||||
|
||||
dintrlv_2x64( hash0, hash1, vhash64, 512 );
|
||||
|
||||
sha256_full( hash0, hash0, 64 );
|
||||
sha256_full( hash1, hash1, 64 );
|
||||
|
||||
intrlv_2x32( state, hash0, hash1, 256 );
|
||||
}
|
||||
|
||||
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*2] __attribute__ ((aligned (32)));
|
||||
uint32_t hash[8*2] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash_d7 = &(hash[7<<1]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t targ_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
v128u32_t *noncev = (v128u32_t*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
skein512_2x64_prehash64( &skein512_2x64_ctx, vdata );
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
skeinhash_2x64( hash, vdata );
|
||||
for ( int lane = 0; lane < 2; lane++ )
|
||||
if ( unlikely( ( hash_d7[ lane ] <= targ_d7 ) && !bench ) )
|
||||
{
|
||||
extr_lane_2x32( lane_hash, hash, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = v128_add32( *noncev, v128_64( 0x0000000200000000 ) );
|
||||
n += 2;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3,16 +3,20 @@
|
||||
|
||||
bool register_skein_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (SKEIN_8WAY)
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SKEIN_8WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein_8way;
|
||||
gate->hash = (void*)&skeinhash_8way;
|
||||
#elif defined (SKEIN_4WAY)
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
#elif defined(SKEIN_4WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein_4way;
|
||||
gate->hash = (void*)&skeinhash_4way;
|
||||
#elif defined(SKEIN_2WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein_2x64;
|
||||
gate->hash = (void*)&skeinhash_2x64;
|
||||
#else
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT;
|
||||
gate->scanhash = (void*)&scanhash_skein;
|
||||
gate->hash = (void*)&skeinhash;
|
||||
#endif
|
||||
@@ -21,16 +25,15 @@ bool register_skein_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_skein2_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
#if defined (SKEIN_8WAY)
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
#if defined(SKEIN_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein2_8way;
|
||||
gate->hash = (void*)&skein2hash_8way;
|
||||
#elif defined (SKEIN_4WAY)
|
||||
#elif defined(SKEIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein2_4way;
|
||||
gate->hash = (void*)&skein2hash_4way;
|
||||
#elif defined(SKEIN_2WAY)
|
||||
gate->scanhash = (void*)&scanhash_skein2_2x64;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_skein2;
|
||||
gate->hash = (void*)&skein2hash;
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
#define SKEIN_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SKEIN_4WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define SKEIN_2WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(SKEIN_8WAY)
|
||||
@@ -29,6 +31,16 @@ void skein2hash_4way( void *output, const void *input );
|
||||
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t* hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(SKEIN_2WAY)
|
||||
|
||||
void skeinhash_2x64( void *output, const void *input );
|
||||
int scanhash_skein_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void skein2hash_2x64( void *output, const void *input );
|
||||
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t* hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
void skeinhash( void *output, const void *input );
|
||||
|
||||
@@ -675,11 +675,13 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
|
||||
|
||||
// Close
|
||||
|
||||
unsigned et;
|
||||
|
||||
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
et = 352 + ((bcount == 0) << 7);
|
||||
UBI_BIG_8WAY( et, ptr );
|
||||
if ( ptr )
|
||||
{
|
||||
unsigned et;
|
||||
memset_zero_512( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
et = 352 + ((bcount == 0) << 7);
|
||||
UBI_BIG_8WAY( et, ptr );
|
||||
}
|
||||
|
||||
memset_zero_512( buf, buf_size >> 3 );
|
||||
bcount = 0;
|
||||
@@ -970,11 +972,13 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
|
||||
|
||||
// Close
|
||||
|
||||
unsigned et;
|
||||
|
||||
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
et = 352 + ((bcount == 0) << 7);
|
||||
UBI_BIG_4WAY( et, ptr );
|
||||
if ( ptr )
|
||||
{
|
||||
unsigned et;
|
||||
memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
et = 352 + ((bcount == 0) << 7);
|
||||
UBI_BIG_4WAY( et, ptr );
|
||||
}
|
||||
|
||||
memset_zero_256( buf, buf_size >> 3 );
|
||||
bcount = 0;
|
||||
@@ -1364,11 +1368,13 @@ skein512_2x64_full( skein512_2x64_context *sc, void *out, const void *data,
|
||||
|
||||
// Close
|
||||
|
||||
unsigned et;
|
||||
|
||||
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
et = 352 + ((bcount == 0) << 7);
|
||||
UBI_BIG_2WAY( et, ptr );
|
||||
if ( ptr )
|
||||
{
|
||||
unsigned et;
|
||||
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
|
||||
et = 352 + ((bcount == 0) << 7);
|
||||
UBI_BIG_2WAY( et, ptr );
|
||||
}
|
||||
|
||||
v128_memset_zero( buf, buf_size >> 3 );
|
||||
bcount = 0;
|
||||
|
||||
@@ -5,19 +5,6 @@
|
||||
|
||||
#if defined(SKEIN_8WAY)
|
||||
|
||||
static __thread skein512_8way_context skein512_8way_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
void skein2hash_8way( void *output, const void *input )
|
||||
{
|
||||
uint64_t hash[16*8] __attribute__ ((aligned (128)));
|
||||
skein512_8way_context ctx;
|
||||
memcpy( &ctx, &skein512_8way_ctx, sizeof( ctx ) );
|
||||
|
||||
skein512_8way_final16( &ctx, hash, input + (64*8) );
|
||||
skein512_8way_full( &ctx, output, hash, 64 );
|
||||
}
|
||||
|
||||
int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -68,19 +55,6 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#elif defined(SKEIN_4WAY)
|
||||
|
||||
static __thread skein512_4way_context skein512_4way_ctx
|
||||
__attribute__ ((aligned (64)));
|
||||
|
||||
void skein2hash_4way( void *output, const void *input )
|
||||
{
|
||||
skein512_4way_context ctx;
|
||||
memcpy( &ctx, &skein512_4way_ctx, sizeof( ctx ) );
|
||||
uint64_t hash[16*4] __attribute__ ((aligned (64)));
|
||||
|
||||
skein512_4way_final16( &ctx, hash, input + (64*4) );
|
||||
skein512_4way_full( &ctx, output, hash, 64 );
|
||||
}
|
||||
|
||||
int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -128,4 +102,53 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SKEIN_2WAY)
|
||||
|
||||
int scanhash_skein2_2x64( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[8*2] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
|
||||
skein512_2x64_context ctx;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint64_t *hash_q3 = &(hash[3*2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
v128u64_t *noncev = (v128u64_t*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const v128u64_t two = v128_64( 0x0000000200000000 );
|
||||
|
||||
v128_bswap32_intrlv80_2x64( vdata, pdata );
|
||||
skein512_2x64_prehash64( &ctx, vdata );
|
||||
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
|
||||
|
||||
do
|
||||
{
|
||||
skein512_2x64_final16( &ctx, hash, vdata + (16*2) );
|
||||
skein512_2x64_full( &ctx, hash, hash, 64 );
|
||||
|
||||
for ( int lane = 0; lane < 2; lane++ )
|
||||
if ( hash_q3[ lane ] <= targ_q3 )
|
||||
{
|
||||
extr_lane_2x64( lane_hash, hash, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = v128_add32( *noncev, two );
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -191,7 +191,7 @@ static void rotate_indexes( uint32_t *p )
|
||||
*(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \
|
||||
*(__m256i*)hash, *(__m256i*)blob_off ), k );
|
||||
|
||||
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
|
||||
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
|
||||
|
||||
#define MULXOR \
|
||||
casti_v128( hash, 0 ) = v128_mul32( v128_xor( \
|
||||
@@ -251,7 +251,7 @@ void verthash_hash( const void *blob_bytes, const size_t blob_size,
|
||||
/ VH_BYTE_ALIGNMENT ) + 1;
|
||||
#if defined (__AVX2__)
|
||||
const __m256i k = _mm256_set1_epi32( 0x1000193 );
|
||||
#elif defined(__SSE4_1__) // || defined(__ARM_NEON)
|
||||
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
|
||||
const v128u32_t k = v128_32( 0x1000193 );
|
||||
#endif
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ bool register_verthash_algo( algo_gate_t* gate )
|
||||
{
|
||||
opt_target_factor = 256.0;
|
||||
gate->scanhash = (void*)&scanhash_verthash;
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT | NEON_OPT;
|
||||
|
||||
const char *verthash_data_file = opt_data_file ? opt_data_file
|
||||
: default_verthash_data_file;
|
||||
|
||||
@@ -11,7 +11,9 @@
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/simd-hash-2way.h"
|
||||
#if defined(__aarch64__)
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#endif
|
||||
@@ -31,8 +33,6 @@
|
||||
#else
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#endif
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
|
||||
// Config
|
||||
#define MINOTAUR_ALGO_COUNT 16
|
||||
@@ -69,11 +69,7 @@ struct TortureGarden
|
||||
cubehashParam cube;
|
||||
shavite512_context shavite;
|
||||
hashState_luffa luffa;
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_context simd;
|
||||
#else
|
||||
hashState_sd simd;
|
||||
#endif
|
||||
simd512_context simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_shabal512_context shabal;
|
||||
sph_whirlpool_context whirlpool;
|
||||
@@ -165,13 +161,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
|
||||
sph_shavite512_close( &garden->shavite, hash );
|
||||
break;
|
||||
case 13:
|
||||
#if defined(__aarch64__)
|
||||
sph_simd512_init( &garden->simd );
|
||||
sph_simd512( &garden->simd, input, 64);
|
||||
sph_simd512_close( &garden->simd, hash );
|
||||
#else
|
||||
simd_full( &garden->simd, (BitSequence *)hash, input, 512 );
|
||||
#endif
|
||||
simd512_ctx( &garden->simd, hash, input, 64 );
|
||||
break;
|
||||
case 14:
|
||||
sph_skein512_init( &garden->skein );
|
||||
|
||||
@@ -931,15 +931,19 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
|
||||
// Need sph in some cases
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
//#include "algo/simd/sph_simd.h"
|
||||
//#include "algo/simd/nist.h"
|
||||
#if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#endif
|
||||
#include "algo/shabal/sph_shabal.h"
|
||||
#include "algo/haval/sph-haval.h"
|
||||
//#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
|
||||
#if !( defined(__AES__) ) //|| defined(__ARM_FEATURE_AES) )
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
|
||||
#include "algo/echo/sph_echo.h"
|
||||
//#endif
|
||||
#endif
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
|
||||
union _x17_context_overlay
|
||||
@@ -967,12 +971,8 @@ union _x17_context_overlay
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
#if defined(__x86_64__)
|
||||
simd512_context simd;
|
||||
#else
|
||||
sph_simd512_context simd;
|
||||
#endif
|
||||
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
hamsi_2x64_context hamsi;
|
||||
#else
|
||||
sph_hamsi512_context hamsi;
|
||||
@@ -1033,17 +1033,8 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
|
||||
sph_shavite512( &ctx.shavite, hash1, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash1 );
|
||||
|
||||
#if defined(__x86_64__)
|
||||
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
|
||||
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
|
||||
#else
|
||||
sph_simd512_init( &ctx.simd );
|
||||
sph_simd512( &ctx.simd, hash0, 64 );
|
||||
sph_simd512_close( &ctx.simd, hash0 );
|
||||
sph_simd512_init( &ctx.simd );
|
||||
sph_simd512( &ctx.simd, hash1, 64 );
|
||||
sph_simd512_close( &ctx.simd, hash1 );
|
||||
#endif
|
||||
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
|
||||
@@ -1057,7 +1048,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
|
||||
sph_echo512_close( &ctx.echo, hash1 );
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
intrlv_2x64( vhash, hash0, hash1, 512 );
|
||||
hamsi512_2x64_ctx( &ctx.hamsi, vhash, vhash, 64 );
|
||||
dintrlv_2x64( hash0, hash1, vhash, 512 );
|
||||
|
||||
Reference in New Issue
Block a user