This commit is contained in:
Jay D Dee
2023-08-30 20:15:48 -04:00
parent 57a6b7b58b
commit 4378d2f841
72 changed files with 10184 additions and 2182 deletions

View File

@@ -65,9 +65,19 @@ If not what makes it happen or not happen?
Change Log
----------
v3.23.0
#398: Prevent GBT fallback to Getwork on network error.
#398: Prevent excessive logs when conditional mining is paused when mining solo.
Fix a false start if stratum doesn't immediately send a new job after connecting.
Tweak diagonal shuffle in Blake2b & Blake256 1-way SIMD to reduce latency.
CPUID support for AVX10.
Initial changes to AVX2 targeted code in preparation for AVX10.
Code cleanup and miscellaneous small improvements.
v3.22.3
Data interleaving and byte swap optimizations iwith AVX2, AVX512 & AVX512VBMI.
Data interleaving and byte swap optimizations with AVX2, AVX512 & AVX512VBMI.
Faster Luffa with AVX2 & AVX512.
Other small optimizations.
Some code cleanup.

View File

@@ -171,7 +171,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -227,7 +227,7 @@ int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -94,10 +94,13 @@ typedef uint32_t set_t;
#define SSE42_OPT 4
#define AVX_OPT 8 // Sandybridge
#define AVX2_OPT 0x10 // Haswell, Zen1
#define SHA_OPT 0x20 // Zen1, Icelake (sha256)
#define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW])
#define VAES_OPT 0x80 // Icelake (VAES & AVX512)
#define SHA_OPT 0x20 // Zen1, Icelake (deprecated)
#define AVX512_OPT 0x40 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define VAES_OPT 0x80 // Icelake, Zen3
// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
// AVX10_256 is compatible with AVX2 + VAES
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }

View File

@@ -308,7 +308,52 @@ static const sph_u32 CS[16] = {
/////////////////////////////////////////
//
// Blake-256 1 way SIMD
// Only used for prehash, otherwise 4way is used with SSE2.
// optimize shuffles to reduce latency caused by dependencies on V1.
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shufll_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shuflr_32( V2 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ), \
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ), \
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shuflr_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shufll_32( V2 ); \
}
/*
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
@@ -350,6 +395,7 @@ static const sph_u32 CS[16] = {
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shufll_32( V1 ); \
}
*/
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 )

View File

@@ -252,14 +252,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
v[12] = m512_const1_64( 0x510E527FADE682D1 );
v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
v[ 8] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
v[ 9] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
v[10] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
v[11] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
v[12] = _mm512_set1_epi64( 0x510E527FADE682D1 );
v[13] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
v[14] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
v[15] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
{
size_t i;
ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
ctx->h[1] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
ctx->h[2] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
ctx->h[3] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = _mm512_set1_epi64( 0x510E527FADE682D1 );
ctx->h[5] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
ctx->h[6] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
ctx->h[0] = _mm512_xor_si512( ctx->h[0], _mm512_set1_epi64( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;
@@ -419,14 +419,14 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
v[12] = m256_const1_64( 0x510E527FADE682D1 );
v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
v[ 8] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
v[ 9] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
v[10] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
v[11] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
v[12] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
v[13] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
v[14] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
v[15] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
{
size_t i;
ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
ctx->h[1] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
ctx->h[2] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
ctx->h[3] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
ctx->h[4] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
ctx->h[5] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
ctx->h[6] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
ctx->h[7] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], _mm256_set1_epi64x( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;

View File

@@ -62,14 +62,14 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
memset( S, 0, sizeof( blake2s_4way_state ) );
S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
S->h[0] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
S->h[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm_set1_epi64x( 0x510E527F510E527FULL );
S->h[5] = _mm_set1_epi64x( 0x9B05688C9B05688CULL );
S->h[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v[ 8] = m128_const1_64( 0x6A09E6676A09E667ULL );
v[ 9] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
v[10] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
v[11] = m128_const1_64( 0xA54FF53AA54FF53AULL );
v[ 8] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
v[ 9] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
v[10] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
v[11] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
m128_const1_64( 0x510E527F510E527FULL ) );
_mm_set1_epi64x( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
m128_const1_64( 0x9B05688C9B05688CULL ) );
_mm_set1_epi64x( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
m128_const1_64( 0x1F83D9AB1F83D9ABULL ) );
_mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
m128_const1_64( 0x5BE0CD195BE0CD19ULL ) );
_mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
@@ -269,21 +269,21 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
memcpy_256( m, block, 16 );
memcpy_256( v, S->h, 8 );
v[ 8] = m256_const1_64( 0x6A09E6676A09E667ULL );
v[ 9] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
v[10] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
v[11] = m256_const1_64( 0xA54FF53AA54FF53AULL );
v[ 8] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
v[ 9] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
v[10] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
v[11] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
m256_const1_64( 0x510E527F510E527FULL ) );
_mm256_set1_epi64x( 0x510E527F510E527FULL ) );
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
m256_const1_64( 0x9B05688C9B05688CULL ) );
_mm256_set1_epi64x( 0x9B05688C9B05688CULL ) );
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
m256_const1_64( 0x1F83D9AB1F83D9ABULL ) );
_mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
m256_const1_64( 0x5BE0CD195BE0CD19ULL ) );
_mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
/*
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
@@ -391,14 +391,14 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_8way_state ) );
S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
S->h[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
S->h[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm256_set1_epi64x( 0x510E527F510E527FULL );
S->h[5] = _mm256_set1_epi64x( 0x9B05688C9B05688CULL );
S->h[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
memcpy_512( m, block, 16 );
memcpy_512( v, S->h, 8 );
v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
v[ 8] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
v[ 9] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
v[10] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
v[11] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
m512_const1_64( 0x510E527F510E527FULL ) );
_mm512_set1_epi64( 0x510E527F510E527FULL ) );
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
m512_const1_64( 0x9B05688C9B05688CULL ) );
_mm512_set1_epi64( 0x9B05688C9B05688CULL ) );
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
_mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
_mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL ) );
#define G16W( sigma0, sigma1, a, b, c, d) \
@@ -589,14 +589,14 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_16way_state ) );
S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
S->h[0] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
S->h[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm512_set1_epi64( 0x510E527F510E527FULL );
S->h[5] = _mm512_set1_epi64( 0x9B05688C9B05688CULL );
S->h[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL );
uint32_t *p = ( uint32_t * )( P );

View File

@@ -64,6 +64,22 @@
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
}
// Pivot about V[1] instead of V[0] reduces latency.
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
V[0] = mm256_shufll_64( V[0] ); \
V[3] = mm256_swap_128( V[3] ); \
V[2] = mm256_shuflr_64( V[2] ); \
BLAKE2B_G( 14, 15, 8, 9, 10, 11, 12, 13 ); \
V[0] = mm256_shuflr_64( V[0] ); \
V[3] = mm256_swap_128( V[3] ); \
V[2] = mm256_shufll_64( V[2] ); \
}
/*
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
@@ -77,6 +93,7 @@
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shufll_64( V[1] ); \
}
*/
#elif defined(__SSE2__)
// always true

View File

@@ -451,22 +451,22 @@ static const __m128i final_s[16] =
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = m128_const1_64( 0x4041424340414243 );
ctx->H[ 1] = m128_const1_64( 0x4445464744454647 );
ctx->H[ 2] = m128_const1_64( 0x48494A4B48494A4B );
ctx->H[ 3] = m128_const1_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = m128_const1_64( 0x5051525350515253 );
ctx->H[ 5] = m128_const1_64( 0x5455565754555657 );
ctx->H[ 6] = m128_const1_64( 0x58595A5B58595A5B );
ctx->H[ 7] = m128_const1_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = m128_const1_64( 0x6061626360616263 );
ctx->H[ 9] = m128_const1_64( 0x6465666764656667 );
ctx->H[10] = m128_const1_64( 0x68696A6B68696A6B );
ctx->H[11] = m128_const1_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = m128_const1_64( 0x7071727370717273 );
ctx->H[13] = m128_const1_64( 0x7475767774757677 );
ctx->H[14] = m128_const1_64( 0x78797A7B78797A7B );
ctx->H[15] = m128_const1_64( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
@@ -529,7 +529,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
ptr += 4;
h = sc->H;
@@ -959,22 +959,22 @@ static const __m256i final_s8[16] =
void bmw256_8way_init( bmw256_8way_context *ctx )
{
ctx->H[ 0] = m256_const1_64( 0x4041424340414243 );
ctx->H[ 1] = m256_const1_64( 0x4445464744454647 );
ctx->H[ 2] = m256_const1_64( 0x48494A4B48494A4B );
ctx->H[ 3] = m256_const1_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = m256_const1_64( 0x5051525350515253 );
ctx->H[ 5] = m256_const1_64( 0x5455565754555657 );
ctx->H[ 6] = m256_const1_64( 0x58595A5B58595A5B );
ctx->H[ 7] = m256_const1_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = m256_const1_64( 0x6061626360616263 );
ctx->H[ 9] = m256_const1_64( 0x6465666764656667 );
ctx->H[10] = m256_const1_64( 0x68696A6B68696A6B );
ctx->H[11] = m256_const1_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = m256_const1_64( 0x7071727370717273 );
ctx->H[13] = m256_const1_64( 0x7475767774757677 );
ctx->H[14] = m256_const1_64( 0x78797A7B78797A7B );
ctx->H[15] = m256_const1_64( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = _mm256_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm256_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm256_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm256_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm256_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm256_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm256_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm256_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm256_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm256_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm256_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm256_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm256_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm256_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm256_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm256_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -1030,7 +1030,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
ptr += 4;
h = ctx->H;
@@ -1460,22 +1460,22 @@ static const __m512i final_s16[16] =
void bmw256_16way_init( bmw256_16way_context *ctx )
{
ctx->H[ 0] = m512_const1_64( 0x4041424340414243 );
ctx->H[ 1] = m512_const1_64( 0x4445464744454647 );
ctx->H[ 2] = m512_const1_64( 0x48494A4B48494A4B );
ctx->H[ 3] = m512_const1_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = m512_const1_64( 0x5051525350515253 );
ctx->H[ 5] = m512_const1_64( 0x5455565754555657 );
ctx->H[ 6] = m512_const1_64( 0x58595A5B58595A5B );
ctx->H[ 7] = m512_const1_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = m512_const1_64( 0x6061626360616263 );
ctx->H[ 9] = m512_const1_64( 0x6465666764656667 );
ctx->H[10] = m512_const1_64( 0x68696A6B68696A6B );
ctx->H[11] = m512_const1_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = m512_const1_64( 0x7071727370717273 );
ctx->H[13] = m512_const1_64( 0x7475767774757677 );
ctx->H[14] = m512_const1_64( 0x78797A7B78797A7B );
ctx->H[15] = m512_const1_64( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = _mm512_set1_epi64( 0x4041424340414243 );
ctx->H[ 1] = _mm512_set1_epi64( 0x4445464744454647 );
ctx->H[ 2] = _mm512_set1_epi64( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm512_set1_epi64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm512_set1_epi64( 0x5051525350515253 );
ctx->H[ 5] = _mm512_set1_epi64( 0x5455565754555657 );
ctx->H[ 6] = _mm512_set1_epi64( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm512_set1_epi64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm512_set1_epi64( 0x6061626360616263 );
ctx->H[ 9] = _mm512_set1_epi64( 0x6465666764656667 );
ctx->H[10] = _mm512_set1_epi64( 0x68696A6B68696A6B );
ctx->H[11] = _mm512_set1_epi64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm512_set1_epi64( 0x7071727370717273 );
ctx->H[13] = _mm512_set1_epi64( 0x7475767774757677 );
ctx->H[14] = _mm512_set1_epi64( 0x78797A7B78797A7B );
ctx->H[15] = _mm512_set1_epi64( 0x7C7D7E7F7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -1531,7 +1531,7 @@ void bmw256_16way_close( bmw256_16way_context *ctx, void *dst )
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
ptr += 4;
h = ctx->H;

View File

@@ -896,22 +896,22 @@ static const __m256i final_b[16] =
static void
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
{
sc->H[ 0] = m256_const1_64( 0x8081828384858687 );
sc->H[ 1] = m256_const1_64( 0x88898A8B8C8D8E8F );
sc->H[ 2] = m256_const1_64( 0x9091929394959697 );
sc->H[ 3] = m256_const1_64( 0x98999A9B9C9D9E9F );
sc->H[ 4] = m256_const1_64( 0xA0A1A2A3A4A5A6A7 );
sc->H[ 5] = m256_const1_64( 0xA8A9AAABACADAEAF );
sc->H[ 6] = m256_const1_64( 0xB0B1B2B3B4B5B6B7 );
sc->H[ 7] = m256_const1_64( 0xB8B9BABBBCBDBEBF );
sc->H[ 8] = m256_const1_64( 0xC0C1C2C3C4C5C6C7 );
sc->H[ 9] = m256_const1_64( 0xC8C9CACBCCCDCECF );
sc->H[10] = m256_const1_64( 0xD0D1D2D3D4D5D6D7 );
sc->H[11] = m256_const1_64( 0xD8D9DADBDCDDDEDF );
sc->H[12] = m256_const1_64( 0xE0E1E2E3E4E5E6E7 );
sc->H[13] = m256_const1_64( 0xE8E9EAEBECEDEEEF );
sc->H[14] = m256_const1_64( 0xF0F1F2F3F4F5F6F7 );
sc->H[15] = m256_const1_64( 0xF8F9FAFBFCFDFEFF );
sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 );
sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F );
sc->H[ 2] = _mm256_set1_epi64x( 0x9091929394959697 );
sc->H[ 3] = _mm256_set1_epi64x( 0x98999A9B9C9D9E9F );
sc->H[ 4] = _mm256_set1_epi64x( 0xA0A1A2A3A4A5A6A7 );
sc->H[ 5] = _mm256_set1_epi64x( 0xA8A9AAABACADAEAF );
sc->H[ 6] = _mm256_set1_epi64x( 0xB0B1B2B3B4B5B6B7 );
sc->H[ 7] = _mm256_set1_epi64x( 0xB8B9BABBBCBDBEBF );
sc->H[ 8] = _mm256_set1_epi64x( 0xC0C1C2C3C4C5C6C7 );
sc->H[ 9] = _mm256_set1_epi64x( 0xC8C9CACBCCCDCECF );
sc->H[10] = _mm256_set1_epi64x( 0xD0D1D2D3D4D5D6D7 );
sc->H[11] = _mm256_set1_epi64x( 0xD8D9DADBDCDDDEDF );
sc->H[12] = _mm256_set1_epi64x( 0xE0E1E2E3E4E5E6E7 );
sc->H[13] = _mm256_set1_epi64x( 0xE8E9EAEBECEDEEEF );
sc->H[14] = _mm256_set1_epi64x( 0xF0F1F2F3F4F5F6F7 );
sc->H[15] = _mm256_set1_epi64x( 0xF8F9FAFBFCFDFEFF );
sc->ptr = 0;
sc->bit_count = 0;
}
@@ -967,7 +967,7 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>3 ] = m256_const1_64( 0x80 );
buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
ptr += 8;
h = sc->H;
@@ -1379,22 +1379,22 @@ static const __m512i final_b8[16] =
void bmw512_8way_init( bmw512_8way_context *ctx )
//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
{
ctx->H[ 0] = m512_const1_64( 0x8081828384858687 );
ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
ctx->H[ 2] = m512_const1_64( 0x9091929394959697 );
ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
ctx->H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
ctx->H[ 2] = _mm512_set1_epi64( 0x9091929394959697 );
ctx->H[ 3] = _mm512_set1_epi64( 0x98999A9B9C9D9E9F );
ctx->H[ 4] = _mm512_set1_epi64( 0xA0A1A2A3A4A5A6A7 );
ctx->H[ 5] = _mm512_set1_epi64( 0xA8A9AAABACADAEAF );
ctx->H[ 6] = _mm512_set1_epi64( 0xB0B1B2B3B4B5B6B7 );
ctx->H[ 7] = _mm512_set1_epi64( 0xB8B9BABBBCBDBEBF );
ctx->H[ 8] = _mm512_set1_epi64( 0xC0C1C2C3C4C5C6C7 );
ctx->H[ 9] = _mm512_set1_epi64( 0xC8C9CACBCCCDCECF );
ctx->H[10] = _mm512_set1_epi64( 0xD0D1D2D3D4D5D6D7 );
ctx->H[11] = _mm512_set1_epi64( 0xD8D9DADBDCDDDEDF );
ctx->H[12] = _mm512_set1_epi64( 0xE0E1E2E3E4E5E6E7 );
ctx->H[13] = _mm512_set1_epi64( 0xE8E9EAEBECEDEEEF );
ctx->H[14] = _mm512_set1_epi64( 0xF0F1F2F3F4F5F6F7 );
ctx->H[15] = _mm512_set1_epi64( 0xF8F9FAFBFCFDFEFF );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -1448,7 +1448,7 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
ptr += 8;
h = ctx->H;
@@ -1483,22 +1483,22 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
// Init
H[ 0] = m512_const1_64( 0x8081828384858687 );
H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
H[ 2] = m512_const1_64( 0x9091929394959697 );
H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
H[ 2] = _mm512_set1_epi64( 0x9091929394959697 );
H[ 3] = _mm512_set1_epi64( 0x98999A9B9C9D9E9F );
H[ 4] = _mm512_set1_epi64( 0xA0A1A2A3A4A5A6A7 );
H[ 5] = _mm512_set1_epi64( 0xA8A9AAABACADAEAF );
H[ 6] = _mm512_set1_epi64( 0xB0B1B2B3B4B5B6B7 );
H[ 7] = _mm512_set1_epi64( 0xB8B9BABBBCBDBEBF );
H[ 8] = _mm512_set1_epi64( 0xC0C1C2C3C4C5C6C7 );
H[ 9] = _mm512_set1_epi64( 0xC8C9CACBCCCDCECF );
H[10] = _mm512_set1_epi64( 0xD0D1D2D3D4D5D6D7 );
H[11] = _mm512_set1_epi64( 0xD8D9DADBDCDDDEDF );
H[12] = _mm512_set1_epi64( 0xE0E1E2E3E4E5E6E7 );
H[13] = _mm512_set1_epi64( 0xE8E9EAEBECEDEEEF );
H[14] = _mm512_set1_epi64( 0xF0F1F2F3F4F5F6F7 );
H[15] = _mm512_set1_epi64( 0xF8F9FAFBFCFDFEFF );
// Update
@@ -1530,7 +1530,7 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
__m512i h1[16], h2[16];
size_t u, v;
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
ptr += 8;
if ( ptr > (buf_size - 8) )

View File

@@ -423,21 +423,6 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
// 2 way 128
// This isn't expected to be used with AVX512 so HW rotate intruction
// is assumed not avaiable.
// Use double buffering to optimize serial bit rotations. Full double
// buffering isn't practical because it needs twice as many registers
// with AVX2 having only half as many as AVX512.
#define ROL2( out0, out1, in0, in1, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( in0, c ); \
__m256i t1 = _mm256_slli_epi32( in1, c ); \
out0 = _mm256_srli_epi32( in0, 32-(c) ); \
out1 = _mm256_srli_epi32( in1, 32-(c) ); \
out0 = _mm256_or_si256( out0, t0 ); \
out1 = _mm256_or_si256( out1, t1 ); \
}
static void transform_2way( cube_2way_context *sp )
{
int r;
@@ -460,8 +445,10 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
ROL2( y0, y1, x2, x3, 7 );
ROL2( x2, x3, x0, x1, 7 );
y0 = mm256_rol_32( x2, 7 );
y1 = mm256_rol_32( x3, 7 );
x2 = mm256_rol_32( x0, 7 );
x3 = mm256_rol_32( x1, 7 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( y1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
@@ -474,8 +461,10 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
ROL2( y0, x1, x1, x0, 11 );
ROL2( y1, x3, x3, x2, 11 );
y0 = mm256_rol_32( x1, 11 );
x1 = mm256_rol_32( x0, 11 );
y1 = mm256_rol_32( x3, 11 );
x3 = mm256_rol_32( x2, 11 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( y1, x6 );

View File

@@ -32,7 +32,7 @@ static void transform( cubehashParam *sp )
{
x1 = _mm512_add_epi32( x0, x1 );
x0 = mm512_swap_256( x0 );
x0 = mm512_rol_32( x0, 7 );
x0 = mm512_rol_32( x0, 7 );
x0 = _mm512_xor_si512( x0, x1 );
x1 = mm512_swap128_64( x1 );
x1 = _mm512_add_epi32( x0, x1 );
@@ -58,19 +58,18 @@ static void transform( cubehashParam *sp )
{
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = x0;
x0 = mm256_rol_32( x1, 7 );
x1 = mm256_rol_32( y0, 7 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
y0 = mm256_rol_32( x1, 7 );
y1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( y0, x2 );
x1 = _mm256_xor_si256( y1, x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_swap_128( x0 );
y1 = mm256_swap_128( x1 );
x0 = mm256_rol_32( y0, 11 );
x1 = mm256_rol_32( y1, 11 );
x0 = mm256_swap_128( x0 );
x1 = mm256_swap_128( x1 );
x0 = mm256_rol_32( x0, 11 );
x1 = mm256_rol_32( x1, 11 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap64_32( x2 );
@@ -94,47 +93,48 @@ static void transform( cubehashParam *sp )
x6 = _mm_load_si128( (__m128i*)sp->x + 6 );
x7 = _mm_load_si128( (__m128i*)sp->x + 7 );
for (r = 0; r < rounds; ++r) {
x4 = _mm_add_epi32(x0, x4);
x5 = _mm_add_epi32(x1, x5);
x6 = _mm_add_epi32(x2, x6);
x7 = _mm_add_epi32(x3, x7);
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
x0 = _mm_xor_si128(x0, x4);
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_shuffle_epi32(x4, 0x4e);
x5 = _mm_shuffle_epi32(x5, 0x4e);
x6 = _mm_shuffle_epi32(x6, 0x4e);
x7 = _mm_shuffle_epi32(x7, 0x4e);
x4 = _mm_add_epi32(x0, x4);
x5 = _mm_add_epi32(x1, x5);
x6 = _mm_add_epi32(x2, x6);
x7 = _mm_add_epi32(x3, x7);
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
x0 = _mm_xor_si128(x0, x4);
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_shuffle_epi32(x4, 0xb1);
x5 = _mm_shuffle_epi32(x5, 0xb1);
x6 = _mm_shuffle_epi32(x6, 0xb1);
x7 = _mm_shuffle_epi32(x7, 0xb1);
for ( r = 0; r < rounds; ++r )
{
x4 = _mm_add_epi32( x0, x4 );
x5 = _mm_add_epi32( x1, x5 );
x6 = _mm_add_epi32( x2, x6 );
x7 = _mm_add_epi32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = mm128_rol_32( y0, 7 );
x1 = mm128_rol_32( y1, 7 );
x2 = mm128_rol_32( y2, 7 );
x3 = mm128_rol_32( y3, 7 );
x0 = _mm_xor_si128( x0, x4 );
x1 = _mm_xor_si128( x1, x5 );
x2 = _mm_xor_si128( x2, x6 );
x3 = _mm_xor_si128( x3, x7 );
x4 = _mm_shuffle_epi32( x4, 0x4e );
x5 = _mm_shuffle_epi32( x5, 0x4e );
x6 = _mm_shuffle_epi32( x6, 0x4e );
x7 = _mm_shuffle_epi32( x7, 0x4e );
x4 = _mm_add_epi32( x0, x4 );
x5 = _mm_add_epi32( x1, x5 );
x6 = _mm_add_epi32( x2, x6 );
x7 = _mm_add_epi32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = mm128_rol_32( y0, 11 );
x1 = mm128_rol_32( y1, 11 );
x2 = mm128_rol_32( y2, 11 );
x3 = mm128_rol_32( y3, 11 );
x0 = _mm_xor_si128( x0, x4 );
x1 = _mm_xor_si128( x1, x5 );
x2 = _mm_xor_si128( x2, x6 );
x3 = _mm_xor_si128( x3, x7 );
x4 = _mm_shuffle_epi32( x4, 0xb1 );
x5 = _mm_shuffle_epi32( x5, 0xb1 );
x6 = _mm_shuffle_epi32( x6, 0xb1 );
x7 = _mm_shuffle_epi32( x7, 0xb1 );
}
_mm_store_si128( (__m128i*)sp->x, x0 );
@@ -180,25 +180,25 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
if ( hashbitlen == 512 )
{
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
return SUCCESS;
@@ -234,10 +234,10 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi64x( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
transform( sp );
transform( sp );
transform( sp );
@@ -279,10 +279,10 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi64x( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
transform( sp );
transform( sp );
@@ -313,25 +313,25 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
if ( hashbitlen == 512 )
{
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
@@ -358,10 +358,10 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi64x( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
transform( sp );
transform( sp );

View File

@@ -566,16 +566,16 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
state->uHashSize = 256;
state->uBlockLength = 192;
state->uRounds = 8;
state->hashsize = m128_const_64( 0, 0x100 );
state->const1536 = m128_const_64( 0, 0x600 );
state->hashsize = _mm_set_epi64x( 0, 0x100 );
state->const1536 = _mm_set_epi64x( 0, 0x600 );
break;
case 512:
state->uHashSize = 512;
state->uBlockLength = 128;
state->uRounds = 10;
state->hashsize = m128_const_64( 0, 0x200 );
state->const1536 = m128_const_64( 0, 0x400 );
state->hashsize = _mm_set_epi64x( 0, 0x200 );
state->const1536 = _mm_set_epi64x( 0, 0x400 );
break;
default:

View File

@@ -469,8 +469,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
s2 = _mm256_xor_si256( s2, t2 );\
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \
_mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \
state2[ 0 ][ j ] = mm256_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
@@ -480,8 +479,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
s2 = _mm256_xor_si256( s2, t2 ); \
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \
_mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \
state2[ 1 ][ j ] = mm256_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
@@ -491,8 +489,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
s2 = _mm256_xor_si256( s2, t2 ); \
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \
_mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \
state2[ 2 ][ j ] = mm256_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
} while(0)

View File

@@ -33,11 +33,11 @@ MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
@@ -131,7 +131,7 @@ MYALIGN const unsigned int _IV512[] = {
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\

View File

@@ -139,7 +139,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -237,7 +237,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\

View File

@@ -128,7 +128,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -226,7 +226,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -275,7 +275,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
b1 = _mm_set_epi64x( 0xffffffffffffffff, 0 ); \
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
a1 = _mm_xor_si128( a1, b1 ); \
a2 = _mm_xor_si128( a2, b1 ); \

View File

@@ -31,7 +31,7 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
}
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -48,7 +48,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -116,7 +116,7 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
else
{
// add first padding
ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
ctx->buffer[rem_ptr] = _mm_set_epi64x( 0, 0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
@@ -148,7 +148,7 @@ int groestl512_full( hashState_groestl* ctx, void* output,
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
// --- update ---
@@ -182,7 +182,7 @@ int groestl512_full( hashState_groestl* ctx, void* output,
else
{
// add first padding
ctx->buffer[i] = m128_const_64( 0, 0x80 );
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
@@ -239,7 +239,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
else
{
// add first padding
ctx->buffer[i] = m128_const_64( 0, 0x80 );
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();

View File

@@ -46,7 +46,7 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
ctx->chaining[ 3 ] = _mm_set_epi64x( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;

View File

@@ -539,7 +539,7 @@ static const __m256i SUBSH_MASK7_2WAY =
j = _mm256_cmpgt_epi8(j, i );\
i = _mm256_add_epi8(i, i);\
j = _mm256_and_si256(j, k);\
i = _mm256_xor_si256(i, j);\
i = mm256_xorand( i, j, k );\
}
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
@@ -550,7 +550,7 @@ static const __m256i SUBSH_MASK7_2WAY =
b0 = a2;\
a1 = _mm256_xor_si256(a1, a2);\
b1 = a3;\
a2 = _mm256_xor_si256(a2, a3);\
TEMP2 = _mm256_xor_si256(a2, a3);\
b2 = a4;\
a3 = _mm256_xor_si256(a3, a4);\
b3 = a5;\
@@ -562,34 +562,20 @@ static const __m256i SUBSH_MASK7_2WAY =
a7 = _mm256_xor_si256(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm256_xor_si256(b0, a4);\
b6 = _mm256_xor_si256(b6, a4);\
b1 = _mm256_xor_si256(b1, a5);\
b7 = _mm256_xor_si256(b7, a5);\
b2 = _mm256_xor_si256(b2, a6);\
b0 = _mm256_xor_si256(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm256_xor_si256(b3, a7);\
b1 = _mm256_xor_si256(b1, a7);\
TEMP1 = b1;\
b4 = _mm256_xor_si256(b4, a0);\
b2 = _mm256_xor_si256(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm256_xor_si256(b5, a1);\
b3 = _mm256_xor_si256(b3, a1);\
b1 = a1;\
b6 = _mm256_xor_si256(b6, a2);\
b4 = _mm256_xor_si256(b4, a2);\
TEMP2 = a2;\
b7 = _mm256_xor_si256(b7, a3);\
b5 = _mm256_xor_si256(b5, a3);\
\
TEMP0 = mm256_xor3( b0, a4, a6 ); \
TEMP1 = mm256_xor3( b1, a5, a7 ); \
b2 = mm256_xor3( b2, a6, a0 ); \
b0 = a0; \
b3 = mm256_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm256_xor3( b6, a4, TEMP2 ); \
b4 = mm256_xor3( b4, a0, TEMP2 ); \
b7 = mm256_xor3( b7, a5, a3 ); \
b5 = mm256_xor3( b5, a1, a3 ); \
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm256_xor_si256(a0, a3);\
a1 = _mm256_xor_si256(a1, a4);\
a2 = _mm256_xor_si256(a2, a5);\
a2 = _mm256_xor_si256( TEMP2, a5);\
a3 = _mm256_xor_si256(a3, a6);\
a4 = _mm256_xor_si256(a4, a7);\
a5 = _mm256_xor_si256(a5, b0);\
@@ -671,7 +657,6 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* MixBytes */\
MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */

View File

@@ -710,7 +710,7 @@ static const __m256i SUBSH_MASK7_2WAY =
b0 = a2;\
a1 = _mm256_xor_si256(a1, a2);\
b1 = a3;\
a2 = _mm256_xor_si256(a2, a3);\
TEMP2 = _mm256_xor_si256(a2, a3);\
b2 = a4;\
a3 = _mm256_xor_si256(a3, a4);\
b3 = a5;\
@@ -722,34 +722,23 @@ static const __m256i SUBSH_MASK7_2WAY =
a7 = _mm256_xor_si256(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm256_xor_si256(b0, a4);\
b6 = _mm256_xor_si256(b6, a4);\
b1 = _mm256_xor_si256(b1, a5);\
b7 = _mm256_xor_si256(b7, a5);\
b2 = _mm256_xor_si256(b2, a6);\
b0 = _mm256_xor_si256(b0, a6);\
TEMP0 = mm256_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm256_xor_si256(b3, a7);\
b1 = _mm256_xor_si256(b1, a7);\
TEMP1 = b1;\
b4 = _mm256_xor_si256(b4, a0);\
b2 = _mm256_xor_si256(b2, a0);\
TEMP1 = mm256_xor3( b1, a5, a7 ); \
b2 = mm256_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm256_xor_si256(b5, a1);\
b3 = _mm256_xor_si256(b3, a1);\
b1 = a1;\
b6 = _mm256_xor_si256(b6, a2);\
b4 = _mm256_xor_si256(b4, a2);\
TEMP2 = a2;\
b7 = _mm256_xor_si256(b7, a3);\
b5 = _mm256_xor_si256(b5, a3);\
b0 = a0; \
b3 = mm256_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm256_xor3( b6, a4, TEMP2 ); \
b4 = mm256_xor3( b4, a0, TEMP2 ); \
b7 = mm256_xor3( b7, a5, a3 ); \
b5 = mm256_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm256_xor_si256(a0, a3);\
a1 = _mm256_xor_si256(a1, a4);\
a2 = _mm256_xor_si256(a2, a5);\
a2 = _mm256_xor_si256( TEMP2, a5);\
a3 = _mm256_xor_si256(a3, a6);\
a4 = _mm256_xor_si256(a4, a7);\
a5 = _mm256_xor_si256(a5, b0);\

View File

@@ -562,14 +562,14 @@ do { \
for ( int u = 0; u < 64; u++ ) \
{ \
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, _mm512_set1_epi64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, _mm512_set1_epi64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, _mm512_set1_epi64( tp[2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, _mm512_set1_epi64( tp[3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, _mm512_set1_epi64( tp[4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, _mm512_set1_epi64( tp[5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, _mm512_set1_epi64( tp[6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, _mm512_set1_epi64( tp[7] ) ); \
db = _mm512_ror_epi64( db, 1 ); \
tp += 8; \
} \
@@ -733,17 +733,17 @@ do { \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
@@ -752,29 +752,29 @@ do { \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (10ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (11ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
@@ -829,14 +829,14 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
sc->h[0] = m512_const1_64( 0x6c70617273746565 );
sc->h[1] = m512_const1_64( 0x656e62656b204172 );
sc->h[2] = m512_const1_64( 0x302c206272672031 );
sc->h[3] = m512_const1_64( 0x3434362c75732032 );
sc->h[4] = m512_const1_64( 0x3030312020422d33 );
sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
sc->h[7] = m512_const1_64( 0x6769756d2042656c );
sc->h[0] = _mm512_set1_epi64( 0x6c70617273746565 );
sc->h[1] = _mm512_set1_epi64( 0x656e62656b204172 );
sc->h[2] = _mm512_set1_epi64( 0x302c206272672031 );
sc->h[3] = _mm512_set1_epi64( 0x3434362c75732032 );
sc->h[4] = _mm512_set1_epi64( 0x3030312020422d33 );
sc->h[5] = _mm512_set1_epi64( 0x656e2d484c657576 );
sc->h[6] = _mm512_set1_epi64( 0x6c65652c65766572 );
sc->h[7] = _mm512_set1_epi64( 0x6769756d2042656c );
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
@@ -859,7 +859,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
sc->buf[0] = m512_const1_64( 0x80 );
sc->buf[0] = _mm512_set1_epi64( 0x80 );
hamsi_8way_big( sc, sc->buf, 1 );
hamsi_8way_big_final( sc, pad );
@@ -870,6 +870,32 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
// Hamsi 4 way AVX2
#if defined(__AVX512VL__)
#define INPUT_BIG \
do { \
__m256i db = _mm256_ror_epi64( *buf, 1 ); \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
const __mmask8 dm = _mm256_cmplt_epi64_mask( db, zero ); \
m0 = _mm256_mask_xor_epi64( m0, dm, m0, _mm256_set1_epi64x( tp[0] ) ); \
m1 = _mm256_mask_xor_epi64( m1, dm, m1, _mm256_set1_epi64x( tp[1] ) ); \
m2 = _mm256_mask_xor_epi64( m2, dm, m2, _mm256_set1_epi64x( tp[2] ) ); \
m3 = _mm256_mask_xor_epi64( m3, dm, m3, _mm256_set1_epi64x( tp[3] ) ); \
m4 = _mm256_mask_xor_epi64( m4, dm, m4, _mm256_set1_epi64x( tp[4] ) ); \
m5 = _mm256_mask_xor_epi64( m5, dm, m5, _mm256_set1_epi64x( tp[5] ) ); \
m6 = _mm256_mask_xor_epi64( m6, dm, m6, _mm256_set1_epi64x( tp[6] ) ); \
m7 = _mm256_mask_xor_epi64( m7, dm, m7, _mm256_set1_epi64x( tp[7] ) ); \
db = _mm256_ror_epi64( db, 1 ); \
tp += 8; \
} \
} while (0)
#else
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -880,25 +906,58 @@ do { \
{ \
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
m256_const1_64( tp[0] ) ) ); \
_mm256_set1_epi64x( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
m256_const1_64( tp[1] ) ) ); \
_mm256_set1_epi64x( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
m256_const1_64( tp[2] ) ) ); \
_mm256_set1_epi64x( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
m256_const1_64( tp[3] ) ) ); \
_mm256_set1_epi64x( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
m256_const1_64( tp[4] ) ) ); \
_mm256_set1_epi64x( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
m256_const1_64( tp[5] ) ) ); \
_mm256_set1_epi64x( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
m256_const1_64( tp[6] ) ) ); \
_mm256_set1_epi64x( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
m256_const1_64( tp[7] ) ) ); \
_mm256_set1_epi64x( tp[7] ) ) ); \
tp += 8; \
} \
} while (0)
#endif
#define SBOX( a, b, c, d ) \
do { \
__m256i t; \
t = a; \
a = mm256_xorand( d, a, c ); \
c = mm256_xor3( a, b, c ); \
b = mm256_xoror( b, d, t ); \
t = _mm256_xor_si256( t, c ); \
d = mm256_xoror( a, b, t ); \
t = mm256_xorand( t, a, b ); \
a = c; \
c = mm256_xor3( b, d, t ); \
b = d; \
d = mm256_not( t ); \
} while (0)
#define L( a, b, c, d ) \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
a = mm256_xor3( a, b, d ); \
c = mm256_xor3( c, d, _mm256_slli_epi32( b, 7 ) ); \
a = mm256_rol_32( a, 5 ); \
c = mm256_rol_32( c, 22 ); \
} while (0)
/*
#define SBOX( a, b, c, d ) \
do { \
__m256i t; \
@@ -937,6 +996,7 @@ do { \
a = mm256_rol_32( a, 5 ); \
c = mm256_rol_32( c, 22 ); \
} while (0)
*/
#define DECL_STATE_BIG \
__m256i c0, c1, c2, c3, c4, c5, c6, c7; \
@@ -1066,17 +1126,17 @@ do { \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
@@ -1085,29 +1145,29 @@ do { \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (10ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (11ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
@@ -1163,14 +1223,14 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
sc->h[0] = m256_const1_64( 0x6c70617273746565 );
sc->h[1] = m256_const1_64( 0x656e62656b204172 );
sc->h[2] = m256_const1_64( 0x302c206272672031 );
sc->h[3] = m256_const1_64( 0x3434362c75732032 );
sc->h[4] = m256_const1_64( 0x3030312020422d33 );
sc->h[5] = m256_const1_64( 0x656e2d484c657576 );
sc->h[6] = m256_const1_64( 0x6c65652c65766572 );
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
sc->h[0] = _mm256_set1_epi64x( 0x6c70617273746565 );
sc->h[1] = _mm256_set1_epi64x( 0x656e62656b204172 );
sc->h[2] = _mm256_set1_epi64x( 0x302c206272672031 );
sc->h[3] = _mm256_set1_epi64x( 0x3434362c75732032 );
sc->h[4] = _mm256_set1_epi64x( 0x3030312020422d33 );
sc->h[5] = _mm256_set1_epi64x( 0x656e2d484c657576 );
sc->h[6] = _mm256_set1_epi64x( 0x6c65652c65766572 );
sc->h[7] = _mm256_set1_epi64x( 0x6769756d2042656c );
}
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
@@ -1193,7 +1253,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch );
sc->buf[0] = m256_const1_64( 0x80 );
sc->buf[0] = _mm256_set1_epi64x( 0x80 );
hamsi_big( sc, sc->buf, 1 );
hamsi_big_final( sc, pad );

View File

@@ -52,6 +52,56 @@ extern "C"{
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif
#if defined(__AVX512VL__)
// ( ~( a ^ b ) ) & c
#define mm128_andnotxor( a, b, c ) \
_mm_ternarylogic_epi32( a, b, c, 0x82 )
#else
#define mm128_andnotxor( a, b, c ) \
_mm_andnot_si128( _mm_xor_si128( a, b ), c )
#endif
#define F1(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, mm128_andxor( x1, x0, x4 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( mm128_andxor( x2, _mm_andnot_si128( x3, x1 ), \
mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 ) ), \
mm128_andxor( x4, x1, x5 ), \
mm128_xorand( x0, x3, x5 ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, \
_mm_and_si128( x3, \
mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \
_mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( \
mm128_andxor( x3, x5, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ) ), \
_mm_and_si128( x4, \
mm128_xor3( x0, _mm_andnot_si128( x2, x5 ), \
_mm_xor_si128( x1, x6 ) ) ), \
mm128_xorand( x0, x2, x6 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \
mm128_xor3( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) )
/*
#define F1(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( x0, \
_mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
@@ -96,6 +146,7 @@ extern "C"{
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), \
_mm_and_si128( x3, x6 ) ) )
*/
/*
* The macros below integrate the phi() permutations, depending on the
@@ -740,14 +791,14 @@ do { \
static void
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = m256_const1_32( 0x243F6A88UL );
sc->s1 = m256_const1_32( 0x85A308D3UL );
sc->s2 = m256_const1_32( 0x13198A2EUL );
sc->s3 = m256_const1_32( 0x03707344UL );
sc->s4 = m256_const1_32( 0xA4093822UL );
sc->s5 = m256_const1_32( 0x299F31D0UL );
sc->s6 = m256_const1_32( 0x082EFA98UL );
sc->s7 = m256_const1_32( 0xEC4E6C89UL );
sc->s0 = _mm256_set1_epi32( 0x243F6A88UL );
sc->s1 = _mm256_set1_epi32( 0x85A308D3UL );
sc->s2 = _mm256_set1_epi32( 0x13198A2EUL );
sc->s3 = _mm256_set1_epi32( 0x03707344UL );
sc->s4 = _mm256_set1_epi32( 0xA4093822UL );
sc->s5 = _mm256_set1_epi32( 0x299F31D0UL );
sc->s6 = _mm256_set1_epi32( 0x082EFA98UL );
sc->s7 = _mm256_set1_epi32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;

View File

@@ -76,19 +76,31 @@ do { \
#endif
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#define notxorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
#else
#define notxorandnot( a, b, c ) \
_mm256_xor_si256( mm256_not( a ), _mm256_andnot_si256( b, c ) )
#endif
#define Sb(x0, x1, x2, x3, c) \
do { \
const __m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
const __m256i cc = _mm256_set1_epi64x( c ); \
x0 = mm256_xorandnot( x0, x2, cc ); \
tmp = mm256_xorand( cc, x0, x1 ); \
x0 = mm256_xorandnot( x0, x3, x2 ); \
x3 = notxorandnot( x3, x1, x2 ); \
x1 = mm256_xorand( x1, x0, x2 ); \
x2 = mm256_xorandnot( x2, x3, x0 ); \
x0 = mm256_xoror( x0, x1, x3 ); \
x3 = mm256_xorand( x3, x1, x2 ); \
x1 = mm256_xorand( x1, tmp, x0 ); \
x2 = _mm256_xor_si256( x2, tmp ); \
} while (0)
@@ -96,11 +108,11 @@ do { \
do { \
x4 = _mm256_xor_si256( x4, x1 ); \
x5 = _mm256_xor_si256( x5, x2 ); \
x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
x6 = mm256_xor3( x6, x3, x0 ); \
x7 = _mm256_xor_si256( x7, x0 ); \
x0 = _mm256_xor_si256( x0, x5 ); \
x1 = _mm256_xor_si256( x1, x6 ); \
x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
x2 = mm256_xor3( x2, x7, x4 ); \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)
@@ -323,12 +335,12 @@ do { \
} while (0)
#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W83(x) Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ), 8 )
#define W84(x) Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )
#define W85(x) Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )
#define W80(x) Wz_8W(x, _mm512_set1_epi64( 0x5555555555555555 ), 1 )
#define W81(x) Wz_8W(x, _mm512_set1_epi64( 0x3333333333333333 ), 2 )
#define W82(x) Wz_8W(x, _mm512_set1_epi64( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W83(x) Wz_8W(x, _mm512_set1_epi64( 0x00FF00FF00FF00FF ), 8 )
#define W84(x) Wz_8W(x, _mm512_set1_epi64( 0x0000FFFF0000FFFF ), 16 )
#define W85(x) Wz_8W(x, _mm512_set1_epi64( 0x00000000FFFFFFFF ), 32 )
#define W86(x) \
do { \
__m512i t = x ## h; \
@@ -352,12 +364,12 @@ do { \
x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
} while (0)
#define W0(x) Wz(x, m256_const1_64( 0x5555555555555555 ), 1 )
#define W1(x) Wz(x, m256_const1_64( 0x3333333333333333 ), 2 )
#define W2(x) Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W3(x) Wz(x, m256_const1_64( 0x00FF00FF00FF00FF ), 8 )
#define W4(x) Wz(x, m256_const1_64( 0x0000FFFF0000FFFF ), 16 )
#define W5(x) Wz(x, m256_const1_64( 0x00000000FFFFFFFF ), 32 )
#define W0(x) Wz(x, _mm256_set1_epi64x( 0x5555555555555555 ), 1 )
#define W1(x) Wz(x, _mm256_set1_epi64x( 0x3333333333333333 ), 2 )
#define W2(x) Wz(x, _mm256_set1_epi64x( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W3(x) Wz(x, _mm256_set1_epi64x( 0x00FF00FF00FF00FF ), 8 )
#define W4(x) Wz(x, _mm256_set1_epi64x( 0x0000FFFF0000FFFF ), 16 )
#define W5(x) Wz(x, _mm256_set1_epi64x( 0x00000000FFFFFFFF ), 32 )
#define W6(x) \
do { \
__m256i t = x ## h; \
@@ -624,22 +636,22 @@ static const sph_u64 IV512[] = {
void jh256_8way_init( jh_8way_context *sc )
{
// bswapped IV256
sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
sc->H[ 0] = _mm512_set1_epi64( 0xebd3202c41a398eb );
sc->H[ 1] = _mm512_set1_epi64( 0xc145b29c7bbecd92 );
sc->H[ 2] = _mm512_set1_epi64( 0xfac7d4609151931c );
sc->H[ 3] = _mm512_set1_epi64( 0x038a507ed6820026 );
sc->H[ 4] = _mm512_set1_epi64( 0x45b92677269e23a4 );
sc->H[ 5] = _mm512_set1_epi64( 0x77941ad4481afbe0 );
sc->H[ 6] = _mm512_set1_epi64( 0x7a176b0226abb5cd );
sc->H[ 7] = _mm512_set1_epi64( 0xa82fff0f4224f056 );
sc->H[ 8] = _mm512_set1_epi64( 0x754d2e7f8996a371 );
sc->H[ 9] = _mm512_set1_epi64( 0x62e27df70849141d );
sc->H[10] = _mm512_set1_epi64( 0x948f2476f7957627 );
sc->H[11] = _mm512_set1_epi64( 0x6c29804757b6d587 );
sc->H[12] = _mm512_set1_epi64( 0x6c0d8eac2d275e5c );
sc->H[13] = _mm512_set1_epi64( 0x0f7a0557c6508451 );
sc->H[14] = _mm512_set1_epi64( 0xea12247067d3e47b );
sc->H[15] = _mm512_set1_epi64( 0x69d71cd313abe389 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -647,22 +659,22 @@ void jh256_8way_init( jh_8way_context *sc )
void jh512_8way_init( jh_8way_context *sc )
{
// bswapped IV512
sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
sc->H[ 0] = _mm512_set1_epi64( 0x17aa003e964bd16f );
sc->H[ 1] = _mm512_set1_epi64( 0x43d5157a052e6a63 );
sc->H[ 2] = _mm512_set1_epi64( 0x0bef970c8d5e228a );
sc->H[ 3] = _mm512_set1_epi64( 0x61c3b3f2591234e9 );
sc->H[ 4] = _mm512_set1_epi64( 0x1e806f53c1a01d89 );
sc->H[ 5] = _mm512_set1_epi64( 0x806d2bea6b05a92a );
sc->H[ 6] = _mm512_set1_epi64( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = _mm512_set1_epi64( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = _mm512_set1_epi64( 0x694ae34105e66901 );
sc->H[ 9] = _mm512_set1_epi64( 0x5ae66f2e8e8ab546 );
sc->H[10] = _mm512_set1_epi64( 0x243c84c1d0a74710 );
sc->H[11] = _mm512_set1_epi64( 0x99c15a2db1716e3b );
sc->H[12] = _mm512_set1_epi64( 0x56f8b19decf657cf );
sc->H[13] = _mm512_set1_epi64( 0x56b116577c8806a7 );
sc->H[14] = _mm512_set1_epi64( 0xfb1785e6dffcc2e3 );
sc->H[15] = _mm512_set1_epi64( 0x4bdd8ccc78465a54 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -721,7 +733,7 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
size_t numz, u;
uint64_t l0, l1;
buf[0] = m512_const1_64( 0x80ULL );
buf[0] = _mm512_set1_epi64( 0x80ULL );
if ( sc->ptr == 0 )
numz = 48;
@@ -772,22 +784,22 @@ jh512_8way_close(void *cc, void *dst)
void jh256_4way_init( jh_4way_context *sc )
{
// bswapped IV256
sc->H[ 0] = m256_const1_64( 0xebd3202c41a398eb );
sc->H[ 1] = m256_const1_64( 0xc145b29c7bbecd92 );
sc->H[ 2] = m256_const1_64( 0xfac7d4609151931c );
sc->H[ 3] = m256_const1_64( 0x038a507ed6820026 );
sc->H[ 4] = m256_const1_64( 0x45b92677269e23a4 );
sc->H[ 5] = m256_const1_64( 0x77941ad4481afbe0 );
sc->H[ 6] = m256_const1_64( 0x7a176b0226abb5cd );
sc->H[ 7] = m256_const1_64( 0xa82fff0f4224f056 );
sc->H[ 8] = m256_const1_64( 0x754d2e7f8996a371 );
sc->H[ 9] = m256_const1_64( 0x62e27df70849141d );
sc->H[10] = m256_const1_64( 0x948f2476f7957627 );
sc->H[11] = m256_const1_64( 0x6c29804757b6d587 );
sc->H[12] = m256_const1_64( 0x6c0d8eac2d275e5c );
sc->H[13] = m256_const1_64( 0x0f7a0557c6508451 );
sc->H[14] = m256_const1_64( 0xea12247067d3e47b );
sc->H[15] = m256_const1_64( 0x69d71cd313abe389 );
sc->H[ 0] = _mm256_set1_epi64x( 0xebd3202c41a398eb );
sc->H[ 1] = _mm256_set1_epi64x( 0xc145b29c7bbecd92 );
sc->H[ 2] = _mm256_set1_epi64x( 0xfac7d4609151931c );
sc->H[ 3] = _mm256_set1_epi64x( 0x038a507ed6820026 );
sc->H[ 4] = _mm256_set1_epi64x( 0x45b92677269e23a4 );
sc->H[ 5] = _mm256_set1_epi64x( 0x77941ad4481afbe0 );
sc->H[ 6] = _mm256_set1_epi64x( 0x7a176b0226abb5cd );
sc->H[ 7] = _mm256_set1_epi64x( 0xa82fff0f4224f056 );
sc->H[ 8] = _mm256_set1_epi64x( 0x754d2e7f8996a371 );
sc->H[ 9] = _mm256_set1_epi64x( 0x62e27df70849141d );
sc->H[10] = _mm256_set1_epi64x( 0x948f2476f7957627 );
sc->H[11] = _mm256_set1_epi64x( 0x6c29804757b6d587 );
sc->H[12] = _mm256_set1_epi64x( 0x6c0d8eac2d275e5c );
sc->H[13] = _mm256_set1_epi64x( 0x0f7a0557c6508451 );
sc->H[14] = _mm256_set1_epi64x( 0xea12247067d3e47b );
sc->H[15] = _mm256_set1_epi64x( 0x69d71cd313abe389 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -795,22 +807,22 @@ void jh256_4way_init( jh_4way_context *sc )
void jh512_4way_init( jh_4way_context *sc )
{
// bswapped IV512
sc->H[ 0] = m256_const1_64( 0x17aa003e964bd16f );
sc->H[ 1] = m256_const1_64( 0x43d5157a052e6a63 );
sc->H[ 2] = m256_const1_64( 0x0bef970c8d5e228a );
sc->H[ 3] = m256_const1_64( 0x61c3b3f2591234e9 );
sc->H[ 4] = m256_const1_64( 0x1e806f53c1a01d89 );
sc->H[ 5] = m256_const1_64( 0x806d2bea6b05a92a );
sc->H[ 6] = m256_const1_64( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = m256_const1_64( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = m256_const1_64( 0x694ae34105e66901 );
sc->H[ 9] = m256_const1_64( 0x5ae66f2e8e8ab546 );
sc->H[10] = m256_const1_64( 0x243c84c1d0a74710 );
sc->H[11] = m256_const1_64( 0x99c15a2db1716e3b );
sc->H[12] = m256_const1_64( 0x56f8b19decf657cf );
sc->H[13] = m256_const1_64( 0x56b116577c8806a7 );
sc->H[14] = m256_const1_64( 0xfb1785e6dffcc2e3 );
sc->H[15] = m256_const1_64( 0x4bdd8ccc78465a54 );
sc->H[ 0] = _mm256_set1_epi64x( 0x17aa003e964bd16f );
sc->H[ 1] = _mm256_set1_epi64x( 0x43d5157a052e6a63 );
sc->H[ 2] = _mm256_set1_epi64x( 0x0bef970c8d5e228a );
sc->H[ 3] = _mm256_set1_epi64x( 0x61c3b3f2591234e9 );
sc->H[ 4] = _mm256_set1_epi64x( 0x1e806f53c1a01d89 );
sc->H[ 5] = _mm256_set1_epi64x( 0x806d2bea6b05a92a );
sc->H[ 6] = _mm256_set1_epi64x( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = _mm256_set1_epi64x( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = _mm256_set1_epi64x( 0x694ae34105e66901 );
sc->H[ 9] = _mm256_set1_epi64x( 0x5ae66f2e8e8ab546 );
sc->H[10] = _mm256_set1_epi64x( 0x243c84c1d0a74710 );
sc->H[11] = _mm256_set1_epi64x( 0x99c15a2db1716e3b );
sc->H[12] = _mm256_set1_epi64x( 0x56f8b19decf657cf );
sc->H[13] = _mm256_set1_epi64x( 0x56b116577c8806a7 );
sc->H[14] = _mm256_set1_epi64x( 0xfb1785e6dffcc2e3 );
sc->H[15] = _mm256_set1_epi64x( 0x4bdd8ccc78465a54 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -869,7 +881,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
size_t numz, u;
uint64_t l0, l1;
buf[0] = m256_const1_64( 0x80ULL );
buf[0] = _mm256_set1_epi64x( 0x80ULL );
if ( sc->ptr == 0 )
numz = 48;

View File

@@ -49,7 +49,7 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
@@ -101,7 +101,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
pdata[19] = n;

View File

@@ -180,15 +180,15 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = m512_const1_64( t );
u.tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = m512_const1_64( eb );
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
@@ -264,8 +264,8 @@ keccak512_8way_close(void *cc, void *dst)
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = mm256_not( s ) )
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOROR(d, a, b, c) (d = mm256_xoror( a, b, c ) )
#define XORAND(d, a, b, c) (d = mm256_xorand( a, b, c ) )
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
#include "keccak-macros.c"
@@ -368,15 +368,15 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = m256_const1_64( t );
u.tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = m256_const1_64( eb );
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */

View File

@@ -56,7 +56,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
@@ -115,7 +115,7 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -69,7 +69,7 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
#define MULT24W( a0, a1 ) \
{ \
__m512i b = _mm512_xor_si512( a0, \
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 0x10 ) ); \
a0 = _mm512_alignr_epi8( a1, b, 4 ); \
a1 = _mm512_alignr_epi8( b, a1, 4 ); \
}
@@ -107,49 +107,37 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
ADD_CONSTANT4W( x0, x4, c0, c1 );
#define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) \
a1 = _mm512_shuffle_epi32( a1, 147 ); \
t0 = _mm512_load_si512( &a1 ); \
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
t0 = _mm512_shuffle_epi32( a1, 147 ); \
a1 = _mm512_unpacklo_epi32( t0, a0 ); \
t0 = _mm512_unpackhi_epi32( t0, a0 ); \
t1 = _mm512_shuffle_epi32( t0, 78 ); \
a0 = _mm512_shuffle_epi32( a1, 78 ); \
SUBCRUMB4W( t1, t0, a0, a1 ); \
t0 = _mm512_unpacklo_epi32( t0, t1 ); \
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
a0 = _mm512_load_si512( &a1 ); \
a0 = _mm512_unpackhi_epi64( a0, t0 ); \
a0 = _mm512_unpackhi_epi64( a1, t0 ); \
a1 = _mm512_unpacklo_epi64( a1, t0 ); \
a1 = _mm512_shuffle_epi32( a1, 57 ); \
MIXWORD4W( a0, a1 ); \
ADD_CONSTANT4W( a0, a1, c0, c1 );
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm512_load_si512(&r3);\
q1 = _mm512_load_si512(&p3);\
s3 = _mm512_load_si512(&r3);\
q3 = _mm512_load_si512(&p3);\
s1 = _mm512_unpackhi_epi32(s1,r2);\
q1 = _mm512_unpackhi_epi32(q1,p2);\
s3 = _mm512_unpacklo_epi32(s3,r2);\
q3 = _mm512_unpacklo_epi32(q3,p2);\
s0 = _mm512_load_si512(&s1);\
q0 = _mm512_load_si512(&q1);\
s2 = _mm512_load_si512(&s3);\
q2 = _mm512_load_si512(&q3);\
r3 = _mm512_load_si512(&r1);\
p3 = _mm512_load_si512(&p1);\
r1 = _mm512_unpacklo_epi32(r1,r0);\
p1 = _mm512_unpacklo_epi32(p1,p0);\
r3 = _mm512_unpackhi_epi32(r3,r0);\
p3 = _mm512_unpackhi_epi32(p3,p0);\
s0 = _mm512_unpackhi_epi64(s0,r3);\
q0 = _mm512_unpackhi_epi64(q0,p3);\
s1 = _mm512_unpacklo_epi64(s1,r3);\
q1 = _mm512_unpacklo_epi64(q1,p3);\
s2 = _mm512_unpackhi_epi64(s2,r1);\
q2 = _mm512_unpackhi_epi64(q2,p1);\
s3 = _mm512_unpacklo_epi64(s3,r1);\
q3 = _mm512_unpacklo_epi64(q3,p1);
s1 = _mm512_unpackhi_epi32( r3, r2 ); \
q1 = _mm512_unpackhi_epi32( p3, p2 ); \
s3 = _mm512_unpacklo_epi32( r3, r2 ); \
q3 = _mm512_unpacklo_epi32( p3, p2 ); \
r3 = _mm512_unpackhi_epi32( r1, r0 ); \
r1 = _mm512_unpacklo_epi32( r1, r0 ); \
p3 = _mm512_unpackhi_epi32( p1, p0 ); \
p1 = _mm512_unpacklo_epi32( p1, p0 ); \
s0 = _mm512_unpackhi_epi64( s1, r3 ); \
q0 = _mm512_unpackhi_epi64( q1 ,p3 ); \
s1 = _mm512_unpacklo_epi64( s1, r3 ); \
q1 = _mm512_unpacklo_epi64( q1, p3 ); \
s2 = _mm512_unpackhi_epi64( s3, r1 ); \
q2 = _mm512_unpackhi_epi64( q3, p1 ); \
s3 = _mm512_unpacklo_epi64( s3, r1 ); \
q3 = _mm512_unpacklo_epi64( q3, p1 );
#define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -198,11 +186,8 @@ void rnd512_4way( luffa_4way_context *state, const __m512i *msg )
chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
MULT24W( chainv[8], chainv[9] );
chainv[8] = _mm512_xor_si512( chainv[8], t0 );
chainv[9] = _mm512_xor_si512( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
t0 = chainv[8] = _mm512_xor_si512( chainv[8], t0 );
t1 = chainv[9] = _mm512_xor_si512( chainv[9], t1 );
MULT24W( chainv[8], chainv[9] );
chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
@@ -538,10 +523,39 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#define MULT2( a0, a1 ) \
{ \
__m256i b = _mm256_xor_si256( a0, \
_mm256_maskz_shuffle_epi32( 0xbb, a1, 0x10 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \
a0 = t; \
}
#else
#define MULT2( a0, a1 ) \
{ \
__m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
_mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
_mm256_blend_epi32( a1, m256_zero, 0xee ), 0x10 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
@@ -567,26 +581,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
a0 = t; \
}
#endif
#define MIXWORD( a, b ) \
{ \
__m256i t1, t2; \
b = _mm256_xor_si256( a,b ); \
t1 = _mm256_slli_epi32( a, 2 ); \
t2 = _mm256_srli_epi32( a, 30 ); \
a = _mm256_or_si256( t1, t2 ); \
a = _mm256_xor_si256( a, b ); \
t1 = _mm256_slli_epi32( b, 14 ); \
t2 = _mm256_srli_epi32( b, 18 ); \
b = _mm256_or_si256( t1, t2 ); \
b = _mm256_xor_si256( a, b ); \
t1 = _mm256_slli_epi32( a, 10 ); \
t2 = _mm256_srli_epi32( a, 22 ); \
a = _mm256_or_si256( t1,t2 ); \
a = _mm256_xor_si256( a,b ); \
t1 = _mm256_slli_epi32( b,1 ); \
t2 = _mm256_srli_epi32( b,31 ); \
b = _mm256_or_si256( t1, t2 ); \
}
b = _mm256_xor_si256( a, b ); \
a = _mm256_xor_si256( b, mm256_rol_32( a, 2 ) ); \
b = _mm256_xor_si256( a, mm256_rol_32( b, 14 ) ); \
a = _mm256_xor_si256( b, mm256_rol_32( a, 10 ) ); \
b = mm256_rol_32( b, 1 );
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB( x0, x1, x2, x3 ); \
@@ -598,49 +600,37 @@ int luffa_4way_update_close( luffa_4way_context *state,
ADD_CONSTANT( x0, x4, c0, c1 );
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
a1 = _mm256_shuffle_epi32( a1, 147); \
t0 = _mm256_load_si256( &a1 ); \
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
t0 = _mm256_shuffle_epi32( a1, 147 ); \
a1 = _mm256_unpacklo_epi32( t0, a0 ); \
t0 = _mm256_unpackhi_epi32( t0, a0 ); \
t1 = _mm256_shuffle_epi32( t0, 78 ); \
a0 = _mm256_shuffle_epi32( a1, 78 ); \
SUBCRUMB( t1, t0, a0, a1 );\
SUBCRUMB( t1, t0, a0, a1 ); \
t0 = _mm256_unpacklo_epi32( t0, t1 ); \
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
a0 = _mm256_load_si256( &a1 ); \
a0 = _mm256_unpackhi_epi64( a0, t0 ); \
a0 = _mm256_unpackhi_epi64( a1, t0 ); \
a1 = _mm256_unpacklo_epi64( a1, t0 ); \
a1 = _mm256_shuffle_epi32( a1, 57 ); \
MIXWORD( a0, a1 ); \
ADD_CONSTANT( a0, a1, c0, c1 );
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm256_load_si256(&r3);\
q1 = _mm256_load_si256(&p3);\
s3 = _mm256_load_si256(&r3);\
q3 = _mm256_load_si256(&p3);\
s1 = _mm256_unpackhi_epi32(s1,r2);\
q1 = _mm256_unpackhi_epi32(q1,p2);\
s3 = _mm256_unpacklo_epi32(s3,r2);\
q3 = _mm256_unpacklo_epi32(q3,p2);\
s0 = _mm256_load_si256(&s1);\
q0 = _mm256_load_si256(&q1);\
s2 = _mm256_load_si256(&s3);\
q2 = _mm256_load_si256(&q3);\
r3 = _mm256_load_si256(&r1);\
p3 = _mm256_load_si256(&p1);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
r3 = _mm256_unpackhi_epi32(r3,r0);\
p3 = _mm256_unpackhi_epi32(p3,p0);\
s0 = _mm256_unpackhi_epi64(s0,r3);\
q0 = _mm256_unpackhi_epi64(q0,p3);\
s1 = _mm256_unpacklo_epi64(s1,r3);\
q1 = _mm256_unpacklo_epi64(q1,p3);\
s2 = _mm256_unpackhi_epi64(s2,r1);\
q2 = _mm256_unpackhi_epi64(q2,p1);\
s3 = _mm256_unpacklo_epi64(s3,r1);\
q3 = _mm256_unpacklo_epi64(q3,p1);
s1 = _mm256_unpackhi_epi32( r3, r2 ); \
q1 = _mm256_unpackhi_epi32( p3, p2 ); \
s3 = _mm256_unpacklo_epi32( r3, r2 ); \
q3 = _mm256_unpacklo_epi32( p3, p2 ); \
r3 = _mm256_unpackhi_epi32( r1, r0 ); \
r1 = _mm256_unpacklo_epi32( r1, r0 ); \
p3 = _mm256_unpackhi_epi32( p1, p0 ); \
p1 = _mm256_unpacklo_epi32( p1, p0 ); \
s0 = _mm256_unpackhi_epi64( s1, r3 ); \
q0 = _mm256_unpackhi_epi64( q1 ,p3 ); \
s1 = _mm256_unpacklo_epi64( s1, r3 ); \
q1 = _mm256_unpacklo_epi64( q1, p3 ); \
s2 = _mm256_unpackhi_epi64( s3, r1 ); \
q2 = _mm256_unpackhi_epi64( q3, p1 ); \
s3 = _mm256_unpacklo_epi64( s3, r1 ); \
q3 = _mm256_unpacklo_epi64( q3, p1 );
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -656,17 +646,10 @@ void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
__m256i *chainv = state->chainv;
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
t0 = chainv[0];
t1 = chainv[1];
t0 = _mm256_xor_si256( t0, chainv[2] );
t1 = _mm256_xor_si256( t1, chainv[3] );
t0 = _mm256_xor_si256( t0, chainv[4] );
t1 = _mm256_xor_si256( t1, chainv[5] );
t0 = _mm256_xor_si256( t0, chainv[6] );
t1 = _mm256_xor_si256( t1, chainv[7] );
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
MULT2( t0, t1 );
@@ -701,11 +684,8 @@ void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
t0 = chainv[8] = _mm256_xor_si256( chainv[8], t0 );
t1 = chainv[9] = _mm256_xor_si256( chainv[9], t1 );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
@@ -794,29 +774,22 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
uint32 hash[8*2] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t[2];
__m256i t0, t1;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, NULL );
t[0] = chainv[0];
t[1] = chainv[1];
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t0 = _mm256_shuffle_epi32( t0, 27 );
t1 = _mm256_shuffle_epi32( t1, 27 );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
@@ -825,22 +798,16 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
rnd512_2way( state, NULL );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
t0 = _mm256_shuffle_epi32( t0, 27 );
t1 = _mm256_shuffle_epi32( t1, 27 );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );

View File

@@ -22,20 +22,29 @@
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#define cns(i) ( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT( a, b, c0 ,c1 ) \
a = _mm_xor_si128( a, c0 ); \
b = _mm_xor_si128( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi32( a1, b, 1 ); \
a1 = _mm_alignr_epi32( b, a1, 1 ); \
__m128i b = _mm_xor_si128( a0, \
_mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
__m128i b = _mm_xor_si128( a0, \
_mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
@@ -44,79 +53,88 @@
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
__m128i b = _mm_xor_si128( a0, \
_mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm_shuffle_epi32(a1,147);\
t0 = _mm_load_si128(&a1);\
a1 = _mm_unpacklo_epi32(a1,a0);\
t0 = _mm_unpackhi_epi32(t0,a0);\
t1 = _mm_shuffle_epi32(t0,78);\
a0 = _mm_shuffle_epi32(a1,78);\
SUBCRUMB(t1,t0,a0,a1,tmp0);\
t0 = _mm_unpacklo_epi32(t0,t1);\
a1 = _mm_unpacklo_epi32(a1,a0);\
a0 = _mm_load_si128(&a1);\
a0 = _mm_unpackhi_epi64(a0,t0);\
a1 = _mm_unpacklo_epi64(a1,t0);\
a1 = _mm_shuffle_epi32(a1,57);\
MIXWORD(a0,a1,tmp0,tmp1);\
ADD_CONSTANT(a0,a1,c0,c1);
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m128i t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a2 = _mm_xor_si128( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a1 = _mm_or_si128( a1, a3 ); \
a3 = _mm_xor_si128( a3, a2 ); \
t = _mm_xor_si128( t, a1 ); \
a2 = _mm_and_si128( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a0 = t; \
}
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm_load_si128(&a0);\
a0 = _mm_or_si128(a0,a1);\
a2 = _mm_xor_si128(a2,a3);\
a1 = mm128_not( a1 );\
a0 = _mm_xor_si128(a0,a3);\
a3 = _mm_and_si128(a3,t);\
a1 = _mm_xor_si128(a1,a3);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a0);\
a0 = mm128_not( a0 );\
a2 = _mm_xor_si128(a2,a1);\
a1 = _mm_or_si128(a1,a3);\
t = _mm_xor_si128(t,a1);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a1);\
a1 = _mm_xor_si128(a1,a0);\
a0 = _mm_load_si128(&t);\
#else
#define MIXWORD(a,b,t1,t2)\
b = _mm_xor_si128(a,b);\
t1 = _mm_slli_epi32(a,2);\
t2 = _mm_srli_epi32(a,30);\
a = _mm_or_si128(t1,t2);\
a = _mm_xor_si128(a,b);\
t1 = _mm_slli_epi32(b,14);\
t2 = _mm_srli_epi32(b,18);\
b = _mm_or_si128(t1,t2);\
b = _mm_xor_si128(a,b);\
t1 = _mm_slli_epi32(a,10);\
t2 = _mm_srli_epi32(a,22);\
a = _mm_or_si128(t1,t2);\
a = _mm_xor_si128(a,b);\
t1 = _mm_slli_epi32(b,1);\
t2 = _mm_srli_epi32(b,31);\
b = _mm_or_si128(t1,t2);
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m128i t = a0; \
a0 = _mm_or_si128( a0, a1 ); \
a2 = _mm_xor_si128( a2, a3 ); \
a1 = mm128_not( a1 ); \
a0 = _mm_xor_si128( a0, a3 ); \
a3 = _mm_and_si128( a3, t ); \
a1 = _mm_xor_si128( a1, a3 ); \
a3 = _mm_xor_si128( a3, a2 ); \
a2 = _mm_and_si128( a2, a0 ); \
a0 = mm128_not( a0 ); \
a2 = _mm_xor_si128( a2, a1 ); \
a1 = _mm_or_si128( a1, a3 ); \
t = _mm_xor_si128( t , a1 ); \
a3 = _mm_xor_si128( a3, a2 ); \
a2 = _mm_and_si128( a2, a1 ); \
a1 = _mm_xor_si128( a1, a0 ); \
a0 = t; \
}
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm_xor_si128(a,c0);\
b = _mm_xor_si128(b,c1);\
#endif
#define MIXWORD( a, b ) \
b = _mm_xor_si128( a, b ); \
a = _mm_xor_si128( b, mm128_rol_32( a, 2 ) ); \
b = _mm_xor_si128( a, mm128_rol_32( b, 14 ) ); \
a = _mm_xor_si128( b, mm128_rol_32( a, 10 ) ); \
b = mm128_rol_32( b, 1 );
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB( x0, x1, x2, x3 ); \
SUBCRUMB( x5, x6, x7, x4 ); \
MIXWORD( x0, x4 ); \
MIXWORD( x1, x5 ); \
MIXWORD( x2, x6 ); \
MIXWORD( x3, x7 ); \
ADD_CONSTANT( x0, x4, c0, c1 );
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
t0 = _mm_shuffle_epi32( a1, 147 ); \
a1 = _mm_unpacklo_epi32( t0, a0 ); \
t0 = _mm_unpackhi_epi32( t0, a0 ); \
t1 = _mm_shuffle_epi32( t0, 78 ); \
a0 = _mm_shuffle_epi32( a1, 78 ); \
SUBCRUMB( t1, t0, a0, a1 ); \
t0 = _mm_unpacklo_epi32( t0, t1 ); \
a1 = _mm_unpacklo_epi32( a1, a0 ); \
a0 = _mm_unpackhi_epi64( a1, t0 ); \
a1 = _mm_unpacklo_epi64( a1, t0 ); \
a1 = _mm_shuffle_epi32( a1, 57 ); \
MIXWORD( a0, a1 ); \
ADD_CONSTANT( a0, a1, c0, c1 );
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm_load_si128(&r1);\
@@ -177,32 +195,22 @@
q1 = _mm_load_si128(&p1);\
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm_load_si128(&r3);\
q1 = _mm_load_si128(&p3);\
s3 = _mm_load_si128(&r3);\
q3 = _mm_load_si128(&p3);\
s1 = _mm_unpackhi_epi32(s1,r2);\
q1 = _mm_unpackhi_epi32(q1,p2);\
s3 = _mm_unpacklo_epi32(s3,r2);\
q3 = _mm_unpacklo_epi32(q3,p2);\
s0 = _mm_load_si128(&s1);\
q0 = _mm_load_si128(&q1);\
s2 = _mm_load_si128(&s3);\
q2 = _mm_load_si128(&q3);\
r3 = _mm_load_si128(&r1);\
p3 = _mm_load_si128(&p1);\
r1 = _mm_unpacklo_epi32(r1,r0);\
p1 = _mm_unpacklo_epi32(p1,p0);\
r3 = _mm_unpackhi_epi32(r3,r0);\
p3 = _mm_unpackhi_epi32(p3,p0);\
s0 = _mm_unpackhi_epi64(s0,r3);\
q0 = _mm_unpackhi_epi64(q0,p3);\
s1 = _mm_unpacklo_epi64(s1,r3);\
q1 = _mm_unpacklo_epi64(q1,p3);\
s2 = _mm_unpackhi_epi64(s2,r1);\
q2 = _mm_unpackhi_epi64(q2,p1);\
s3 = _mm_unpacklo_epi64(s3,r1);\
q3 = _mm_unpacklo_epi64(q3,p1);
s1 = _mm_unpackhi_epi32( r3, r2 ); \
q1 = _mm_unpackhi_epi32( p3, p2 ); \
s3 = _mm_unpacklo_epi32( r3, r2 ); \
q3 = _mm_unpacklo_epi32( p3, p2 ); \
r3 = _mm_unpackhi_epi32( r1, r0 ); \
r1 = _mm_unpacklo_epi32( r1, r0 ); \
p3 = _mm_unpackhi_epi32( p1, p0 ); \
p1 = _mm_unpacklo_epi32( p1, p0 ); \
s0 = _mm_unpackhi_epi64( s1, r3 ); \
q0 = _mm_unpackhi_epi64( q1 ,p3 ); \
s1 = _mm_unpacklo_epi64( s1, r3 ); \
q1 = _mm_unpacklo_epi64( q1, p3 ); \
s2 = _mm_unpackhi_epi64( s3, r1 ); \
q2 = _mm_unpackhi_epi64( q3, p1 ); \
s3 = _mm_unpacklo_epi64( s3, r1 ); \
q3 = _mm_unpacklo_epi64( q3, p1 );
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -306,8 +314,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
casti_m128i( state->buffer, 1 ) = _mm_set_epi32( 0, 0, 0, 0x80000000 );
}
return SUCCESS;
@@ -325,8 +332,7 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
else
{
// empty pad block, constant data
rnd512( state, _mm_setzero_si128(),
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
rnd512( state, _mm_setzero_si128(), _mm_set_epi32( 0, 0, 0, 0x80000000 ) );
}
finalization512(state, (uint32*) hashval);
@@ -423,163 +429,119 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
{
__m128i t[2];
__m128i t0, t1;
__m128i *chainv = state->chainv;
__m128i tmp[2];
__m128i x[8];
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
t[0] = chainv[0];
t[1] = chainv[1];
t0 = mm128_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm128_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm128_xor3( t0, chainv[6], chainv[8] );
t1 = mm128_xor3( t1, chainv[7], chainv[9] );
t[0] = _mm_xor_si128( t[0], chainv[2] );
t[1] = _mm_xor_si128( t[1], chainv[3] );
t[0] = _mm_xor_si128( t[0], chainv[4] );
t[1] = _mm_xor_si128( t[1], chainv[5] );
t[0] = _mm_xor_si128( t[0], chainv[6] );
t[1] = _mm_xor_si128( t[1], chainv[7] );
t[0] = _mm_xor_si128( t[0], chainv[8] );
t[1] = _mm_xor_si128( t[1], chainv[9] );
MULT2( t[0], t[1] );
MULT2( t0, t1 );
msg0 = _mm_shuffle_epi32( msg0, 27 );
msg1 = _mm_shuffle_epi32( msg1, 27 );
chainv[0] = _mm_xor_si128( chainv[0], t[0] );
chainv[1] = _mm_xor_si128( chainv[1], t[1] );
chainv[2] = _mm_xor_si128( chainv[2], t[0] );
chainv[3] = _mm_xor_si128( chainv[3], t[1] );
chainv[4] = _mm_xor_si128( chainv[4], t[0] );
chainv[5] = _mm_xor_si128( chainv[5], t[1] );
chainv[6] = _mm_xor_si128( chainv[6], t[0] );
chainv[7] = _mm_xor_si128( chainv[7], t[1] );
chainv[8] = _mm_xor_si128( chainv[8], t[0] );
chainv[9] = _mm_xor_si128( chainv[9], t[1] );
chainv[0] = _mm_xor_si128( chainv[0], t0 );
chainv[1] = _mm_xor_si128( chainv[1], t1 );
chainv[2] = _mm_xor_si128( chainv[2], t0 );
chainv[3] = _mm_xor_si128( chainv[3], t1 );
chainv[4] = _mm_xor_si128( chainv[4], t0 );
chainv[5] = _mm_xor_si128( chainv[5], t1 );
chainv[6] = _mm_xor_si128( chainv[6], t0 );
chainv[7] = _mm_xor_si128( chainv[7], t1 );
chainv[8] = _mm_xor_si128( chainv[8], t0 );
chainv[9] = _mm_xor_si128( chainv[9], t1 );
t[0] = chainv[0];
t[1] = chainv[1];
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1]);
chainv[0] = _mm_xor_si128( chainv[0], chainv[2] );
chainv[1] = _mm_xor_si128( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]);
chainv[2] = _mm_xor_si128(chainv[2], chainv[4]);
chainv[3] = _mm_xor_si128(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm_xor_si128(chainv[4], chainv[6]);
chainv[5] = _mm_xor_si128(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm_xor_si128(chainv[6], chainv[8]);
chainv[7] = _mm_xor_si128(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm_xor_si128( chainv[8], t[0] );
chainv[9] = _mm_xor_si128( chainv[9], t[1] );
t[0] = chainv[8];
t[1] = chainv[9];
t0 = chainv[8] = _mm_xor_si128( chainv[8], t0 );
t1 = chainv[9] = _mm_xor_si128( chainv[9], t1 );
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm_xor_si128( chainv[8], chainv[6] );
chainv[9] = _mm_xor_si128( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm_xor_si128( chainv[6], chainv[4] );
chainv[7] = _mm_xor_si128( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm_xor_si128( chainv[4], chainv[2] );
chainv[5] = _mm_xor_si128( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm_xor_si128( chainv[2], chainv[0] );
chainv[3] = _mm_xor_si128( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg0 );
chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg1 );
chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t0 ), msg0 );
chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1);
chainv[2] = _mm_xor_si128( chainv[2], msg0 );
chainv[3] = _mm_xor_si128( chainv[3], msg1 );
MULT2( msg0, msg1);
chainv[4] = _mm_xor_si128( chainv[4], msg0 );
chainv[5] = _mm_xor_si128( chainv[5], msg1 );
MULT2( msg0, msg1);
chainv[6] = _mm_xor_si128( chainv[6], msg0 );
chainv[7] = _mm_xor_si128( chainv[7], msg1 );
MULT2( msg0, msg1);
chainv[8] = _mm_xor_si128( chainv[8], msg0 );
chainv[9] = _mm_xor_si128( chainv[9], msg1 );
MULT2( msg0, msg1);
chainv[3] = mm128_rol_32( chainv[3], 1 );
chainv[5] = mm128_rol_32( chainv[5], 2 );
chainv[7] = mm128_rol_32( chainv[7], 3 );
chainv[9] = mm128_rol_32( chainv[9], 4 );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1),
_mm_srli_epi32(chainv[3], 31) );
chainv[5] = _mm_or_si128( _mm_slli_epi32(chainv[5], 2),
_mm_srli_epi32(chainv[5], 30) );
chainv[7] = _mm_or_si128( _mm_slli_epi32(chainv[7], 3),
_mm_srli_epi32(chainv[7], 29) );
chainv[9] = _mm_or_si128( _mm_slli_epi32(chainv[9], 4),
_mm_srli_epi32(chainv[9], 28) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );
MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
STEP_PART( &x[0], &CNS128[ 0], &tmp[0] );
STEP_PART( &x[0], &CNS128[ 2], &tmp[0] );
STEP_PART( &x[0], &CNS128[ 4], &tmp[0] );
STEP_PART( &x[0], &CNS128[ 6], &tmp[0] );
STEP_PART( &x[0], &CNS128[ 8], &tmp[0] );
STEP_PART( &x[0], &CNS128[10], &tmp[0] );
STEP_PART( &x[0], &CNS128[12], &tmp[0] );
STEP_PART( &x[0], &CNS128[14], &tmp[0] );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[16], CNS128[17],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[18], CNS128[19],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[20], CNS128[21],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[22], CNS128[23],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[24], CNS128[25],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[26], CNS128[27],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[28], CNS128[29],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[30], CNS128[31],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
}
@@ -588,51 +550,6 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
/* state: hash context */
/* b[8]: hash values */
#if defined (__AVX2__)
static void finalization512( hashState_luffa *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
__m256i* chainv = (__m256i*)state->chainv;
__m256i t;
const __m128i zero = m128_zero;
const __m256i shuff_bswap32 = _mm256_set_epi64x( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
rnd512( state, zero, zero );
t = chainv[0];
t = _mm256_xor_si256( t, chainv[1] );
t = _mm256_xor_si256( t, chainv[2] );
t = _mm256_xor_si256( t, chainv[3] );
t = _mm256_xor_si256( t, chainv[4] );
t = _mm256_shuffle_epi32( t, 27 );
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
rnd512( state, zero, zero );
t = chainv[0];
t = _mm256_xor_si256( t, chainv[1] );
t = _mm256_xor_si256( t, chainv[2] );
t = _mm256_xor_si256( t, chainv[3] );
t = _mm256_xor_si256( t, chainv[4] );
t = _mm256_shuffle_epi32( t, 27 );
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
}
#else
static void finalization512( hashState_luffa *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
@@ -685,6 +602,5 @@ static void finalization512( hashState_luffa *state, uint32 *b )
casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
}
#endif
/***************************************************/

View File

@@ -212,7 +212,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -398,7 +398,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );

View File

@@ -203,7 +203,7 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
*noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -345,7 +345,7 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -287,7 +287,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -389,7 +389,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
*noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
pdata[19] = n;

View File

@@ -103,7 +103,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -213,7 +213,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
@@ -328,7 +328,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
*noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

View File

@@ -62,10 +62,10 @@ inline void initState( uint64_t State[/*16*/] )
state[1] = zero;
state[2] = zero;
state[3] = zero;
state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
state[4] = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state[5] = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state[6] = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state[7] = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
#else
//First 512 bis are zeros
@@ -299,10 +299,10 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
state1 =
state2 =
state3 = m128_zero;
state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state7 = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
state4 = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state5 = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state6 = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state7 = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
for ( int i = 0; i < nBlocks; i++ )
{

View File

@@ -43,27 +43,29 @@ static const uint64_t blake2b_IV[8] =
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
/*Blake2b's rotation*/
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
// serial data is only 32 bytes so AVX2 is the limit for that dimension.
// However, 2 way parallel looks trivial to code for AVX512 except for
// a data dependency with rowa.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define G2W_4X64(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 24 ); \
a = _mm512_add_epi64( a, b ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s0 = mm512_shufll256_64( s0 ); \
s3 = mm512_swap256_128( s3); \
s2 = mm512_shuflr256_64( s2 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s0 = mm512_shuflr256_64( s0 ); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shufll256_64( s2 );
/*
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s3 = mm512_shufll256_64( s3 ); \
@@ -73,6 +75,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
s3 = mm512_shuflr256_64( s3 ); \
s1 = mm512_shufll256_64( s1 ); \
s2 = mm512_swap256_128( s2 );
*/
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -88,13 +91,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )
#endif // AVX512
#if defined __AVX2__
#if defined(__AVX2__)
// process 4 columns in parallel
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
@@ -105,6 +105,18 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
// Pivot about s1 instead of s0 reduces latency.
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s0 = mm256_shufll_64( s0 ); \
s3 = mm256_swap_128( s3); \
s2 = mm256_shuflr_64( s2 ); \
G_4X64( s0, s1, s2, s3 ); \
s0 = mm256_shuflr_64( s0 ); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shufll_64( s2 );
/*
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s3 = mm256_shufll_64( s3 ); \
@@ -114,6 +126,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
s3 = mm256_shuflr_64( s3 ); \
s1 = mm256_shufll_64( s1 ); \
s2 = mm256_swap_128( s2 );
*/
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -182,8 +195,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#endif // AVX2 else SSE2
// Scalar
//Blake2b's G function
/*
// Scalar, not used.
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G(r,i,a,b,c,d) \
do { \
a = a + b; \
@@ -196,8 +214,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = rotr64(b ^ c, 63); \
} while(0)
/*One Round of the Blake2b's compression function*/
#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -207,6 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -51,7 +51,7 @@ void anime_8way_hash( void *state, const void *input )
__m512i* vhA = (__m512i*)vhashA;
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
const __m512i bit3_mask = m512_const1_64( 8 );
const __m512i bit3_mask = _mm512_set1_epi64( 8 );
__mmask8 vh_mask;
anime_8way_context_overlay ctx __attribute__ ((aligned (64)));
@@ -209,7 +209,7 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -248,7 +248,7 @@ void anime_4way_hash( void *state, const void *input )
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
int h_mask;
const __m256i bit3_mask = m256_const1_64( 8 );
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i zero = _mm256_setzero_si256();
anime_4way_context_overlay ctx __attribute__ ((aligned (64)));
@@ -388,7 +388,7 @@ int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -75,7 +75,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
uint32_t hash7 [16] __attribute__ ((aligned (32)));
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
__mmask8 vh_mask;
const __m512i vmask = m512_const1_64( 24 );
const __m512i vmask = _mm512_set1_epi64( 24 );
const uint32_t mask = 24;
__m512i* vh = (__m512i*)vhash;
__m512i* vhA = (__m512i*)vhashA;
@@ -593,7 +593,7 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
@@ -647,7 +647,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
int h_mask;
const __m256i vmask = m256_const1_64( 24 );
const __m256i vmask = _mm256_set1_epi64x( 24 );
const uint32_t mask = 24;
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
@@ -1041,7 +1041,7 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -67,7 +67,7 @@ void quark_8way_hash( void *state, const void *input )
__mmask8 vh_mask;
quark_8way_ctx_holder ctx;
const uint32_t mask = 8;
const __m512i bit3_mask = m512_const1_64( mask );
const __m512i bit3_mask = _mm512_set1_epi64( mask );
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
@@ -224,7 +224,7 @@ int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
@@ -271,7 +271,7 @@ void quark_4way_hash( void *state, const void *input )
__m256i vh_mask;
int h_mask;
quark_4way_ctx_holder ctx;
const __m256i bit3_mask = m256_const1_64( 8 );
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i zero = _mm256_setzero_si256();
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
@@ -397,7 +397,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

View File

@@ -47,7 +47,7 @@ static const uint32_t IV[5] =
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
m128_const1_64( k ) ), s ), e ); \
_mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
} while (0)
@@ -251,11 +251,11 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )
void ripemd160_4way_init( ripemd160_4way_context *sc )
{
sc->val[0] = m128_const1_64( 0x6745230167452301 );
sc->val[1] = m128_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m128_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m128_const1_64( 0x1032547610325476 );
sc->val[4] = m128_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 );
sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 );
sc->val[2] = _mm_set1_epi64x( 0x98BADCFE98BADCFE );
sc->val[3] = _mm_set1_epi64x( 0x1032547610325476 );
sc->val[4] = _mm_set1_epi64x( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
@@ -347,7 +347,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
do{ \
a = _mm256_add_epi32( mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( \
_mm256_add_epi32( a, f( b ,c, d ) ), r ), \
m256_const1_64( k ) ), s ), e ); \
_mm256_set1_epi64x( k ) ), s ), e ); \
c = mm256_rol_32( c, 10 );\
} while (0)
@@ -552,11 +552,11 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )
void ripemd160_8way_init( ripemd160_8way_context *sc )
{
sc->val[0] = m256_const1_64( 0x6745230167452301 );
sc->val[1] = m256_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m256_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m256_const1_64( 0x1032547610325476 );
sc->val[4] = m256_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 );
sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 );
sc->val[2] = _mm256_set1_epi64x( 0x98BADCFE98BADCFE );
sc->val[3] = _mm256_set1_epi64x( 0x1032547610325476 );
sc->val[4] = _mm256_set1_epi64x( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
@@ -649,7 +649,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
do{ \
a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
_mm512_add_epi32( a, f( b ,c, d ) ), r ), \
m512_const1_64( k ) ), s ), e ); \
_mm512_set1_epi64( k ) ), s ), e ); \
c = mm512_rol_32( c, 10 );\
} while (0)
@@ -853,11 +853,11 @@ static void ripemd160_16way_round( ripemd160_16way_context *sc )
void ripemd160_16way_init( ripemd160_16way_context *sc )
{
sc->val[0] = m512_const1_64( 0x6745230167452301 );
sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m512_const1_64( 0x1032547610325476 );
sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 );
sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = _mm512_set1_epi64( 0x98BADCFE98BADCFE );
sc->val[3] = _mm512_set1_epi64( 0x1032547610325476 );
sc->val[4] = _mm512_set1_epi64( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
@@ -902,7 +902,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
const int pad = block_size - 8;
ptr = (unsigned)sc->count_low & ( block_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
sc->buf[ ptr>>2 ] = _mm512_set1_epi32( 0x80 );
ptr += 4;
if ( ptr > pad )

View File

@@ -311,7 +311,7 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
__m128i A, B, C, D, E, F, G, H;
__m128i W[16]; memcpy_128( W, data, 16 );
// Value required by H after round 60 to produce valid final hash
const __m128i H_ = m128_const1_32( 0x136032ED );
const __m128i H_ = _mm_set1_epi32( 0x136032ED );
A = _mm_load_si128( state_in );
B = _mm_load_si128( state_in+1 );
@@ -408,14 +408,14 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
void sha256_4way_init( sha256_4way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = m128_const1_64( 0x6A09E6676A09E667 );
sc->val[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
sc->val[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
sc->val[3] = m128_const1_64( 0xA54FF53AA54FF53A );
sc->val[4] = m128_const1_64( 0x510E527F510E527F );
sc->val[5] = m128_const1_64( 0x9B05688C9B05688C );
sc->val[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
sc->val[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
}
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
@@ -458,7 +458,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -474,8 +474,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
@@ -589,7 +589,6 @@ do { \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
@@ -863,7 +862,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i H_ = m256_const1_32( 0x136032ED );
const __m256i H_ = _mm256_set1_epi32( 0x136032ED );
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in+1 );
@@ -979,14 +978,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
void sha256_8way_init( sha256_8way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 );
sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A );
sc->val[4] = m256_const1_64( 0x510E527F510E527F );
sc->val[5] = m256_const1_64( 0x9B05688C9B05688C );
sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
}
// need to handle odd byte length for yespower.
@@ -1032,7 +1031,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1048,8 +1047,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
@@ -1360,7 +1359,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// Value for H at round 60, before adding K, needed to produce valid final
// hash where H == 0.
// H_ = -( H256[7] + K256[60] );
const __m512i H_ = m512_const1_32( 0x136032ED );
const __m512i H_ = _mm512_set1_epi32( 0x136032ED );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in+1 );
@@ -1453,14 +1452,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
void sha256_16way_init( sha256_16way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = m512_const1_64( 0x6A09E6676A09E667 );
sc->val[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
sc->val[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
sc->val[3] = m512_const1_64( 0xA54FF53AA54FF53A );
sc->val[4] = m512_const1_64( 0x510E527F510E527F );
sc->val[5] = m512_const1_64( 0x9B05688C9B05688C );
sc->val[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
sc->val[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
@@ -1504,7 +1503,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1520,8 +1519,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );

View File

@@ -28,32 +28,32 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
vdata[i] = _mm512_set1_epi32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m512_const1_64( 0x510E527F510E527F );
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sha256_16way_transform_le( midstate1, vdata, initstate );
@@ -116,31 +116,31 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = _mm256_set1_epi32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m256_const1_64( 0x510E527F510E527F );
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sha256_8way_transform_le( midstate1, vdata, initstate );
@@ -204,31 +204,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
vdata[i] = _mm_set1_epi32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );

View File

@@ -68,7 +68,7 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
*noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
@@ -140,7 +140,7 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;

View File

@@ -28,31 +28,31 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
vdata[i] = _mm512_set1_epi32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m512_const1_64( 0x510E527F510E527F );
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sha256_16way_transform_le( midstate1, vdata, initstate );
@@ -120,31 +120,31 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = _mm256_set1_epi32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m256_const1_64( 0x510E527F510E527F );
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sha256_8way_transform_le( midstate1, vdata, initstate );
@@ -215,31 +215,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
vdata[i] = _mm_set1_epi32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );
@@ -302,31 +302,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
vdata[i] = _mm_set1_epi32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );

View File

@@ -243,7 +243,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
@@ -268,51 +268,56 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
// SHA-512 4 way 64 bit
#define BSG5_0( x ) mm256_xor3( mm256_ror_64( x, 28 ), \
mm256_ror_64( x, 34 ), \
mm256_ror_64( x, 39 ) )
#define BSG5_1( x ) mm256_xor3( mm256_ror_64( x, 14 ), \
mm256_ror_64( x, 18 ), \
mm256_ror_64( x, 41 ) )
#define SSG5_0( x ) mm256_xor3( mm256_ror_64( x, 1 ), \
mm256_ror_64( x, 8 ), \
_mm256_srli_epi64( x, 7 ) )
#define SSG5_1( x ) mm256_xor3( mm256_ror_64( x, 19 ), \
mm256_ror_64( x, 61 ), \
_mm256_srli_epi64( x, 6 ) )
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
// 4 way is not used whith AVX512 but will be whith AVX10_256 when it
// becomes available.
#define CH( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xca )
#define MAJ( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xe8 )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
T1 = _mm256_add_epi64( T1, H ); \
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
T1 = _mm256_add_epi64( T1, T0 ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
#else // AVX2 only
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJ(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
#define BSG5_0(x) \
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 )
#define BSG5_1(x) \
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
/*
#define SSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 7) )
#define SSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
*/
// Interleave SSG0 & SSG1 for better throughput.
// return ssg0(w0) + ssg1(w1)
static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
{
__m256i w0a, w1a, w0b, w1b;
w0a = mm256_ror_64( w0, 1 );
w1a = mm256_ror_64( w1,19 );
w0b = mm256_ror_64( w0, 8 );
w1b = mm256_ror_64( w1,61 );
w0a = _mm256_xor_si256( w0a, w0b );
w1a = _mm256_xor_si256( w1a, w1b );
w0b = _mm256_srli_epi64( w0, 7 );
w1b = _mm256_srli_epi64( w1, 6 );
w0a = _mm256_xor_si256( w0a, w0b );
w1a = _mm256_xor_si256( w1a, w1b );
return _mm256_add_epi64( w0a, w1a );
}
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -324,19 +329,27 @@ do { \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
#endif // AVX512VL AVX10_256
static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
{
int i;
register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
register __m256i A, B, C, D, E, F, G, H;
#if !defined(__AVX512VL__)
// Disable for AVX10_256
__m256i X_xor_Y, Y_xor_Z;
#endif
__m256i W[80];
mm256_block_bswap_64( W , in );
mm256_block_bswap_64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = _mm256_add_epi64( ssg512_add( W[i-15], W[i-2] ),
_mm256_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
W[i] = mm256_add4_64( SSG5_0( W[i-15] ), SSG5_1( W[i-2] ),
W[ i- 7 ], W[ i-16 ] );
if ( ctx->initialized )
{
@@ -351,17 +364,20 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
}
else
{
A = m256_const1_64( 0x6A09E667F3BCC908 );
B = m256_const1_64( 0xBB67AE8584CAA73B );
C = m256_const1_64( 0x3C6EF372FE94F82B );
D = m256_const1_64( 0xA54FF53A5F1D36F1 );
E = m256_const1_64( 0x510E527FADE682D1 );
F = m256_const1_64( 0x9B05688C2B3E6C1F );
G = m256_const1_64( 0x1F83D9ABFB41BD6B );
H = m256_const1_64( 0x5BE0CD19137E2179 );
A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
}
#if !defined(__AVX512VL__)
// Disable for AVX10_256
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
for ( i = 0; i < 80; i += 8 )
{
@@ -389,14 +405,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm256_add_epi64( A, m256_const1_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, m256_const1_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, m256_const1_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, m256_const1_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, m256_const1_64( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, m256_const1_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, m256_const1_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, m256_const1_64( 0x5BE0CD19137E2179 ) );
r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
}
}
@@ -441,7 +457,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
ptr += 8;
if ( ptr > pad )
{

View File

@@ -112,50 +112,50 @@ extern "C"{
else \
{ \
(state)->state_loaded = true; \
A0 = m256_const1_64( 0x20728DFD20728DFD ); \
A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m256_const1_64( 0xE782B699E782B699 ); \
A3 = m256_const1_64( 0x5530463255304632 ); \
A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m256_const1_64( 0x8BD144108BD14410 ); \
AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
B8 = m256_const1_64( 0x48910A5A48910A5A ); \
B9 = m256_const1_64( 0x893B22DB893B22DB ); \
BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
BC = m256_const1_64( 0x72D2F24072D2F240 ); \
BD = m256_const1_64( 0x75941D9975941D99 ); \
BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
C2 = m256_const1_64( 0x56028CB256028CB2 ); \
C3 = m256_const1_64( 0x8134F3598134F359 ); \
C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
C7 = m256_const1_64( 0x0405278004052780 ); \
C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
C9 = m256_const1_64( 0x5194358F5194358F ); \
CA = m256_const1_64( 0x3C60D6653C60D665 ); \
CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
CC = m256_const1_64( 0x950C3434950C3434 ); \
CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
A0 = _mm256_set1_epi64x( 0x20728DFD20728DFD ); \
A1 = _mm256_set1_epi64x( 0x46C0BD5346C0BD53 ); \
A2 = _mm256_set1_epi64x( 0xE782B699E782B699 ); \
A3 = _mm256_set1_epi64x( 0x5530463255304632 ); \
A4 = _mm256_set1_epi64x( 0x71B4EF9071B4EF90 ); \
A5 = _mm256_set1_epi64x( 0x0EA9E82C0EA9E82C ); \
A6 = _mm256_set1_epi64x( 0xDBB930F1DBB930F1 ); \
A7 = _mm256_set1_epi64x( 0xFAD06B8BFAD06B8B ); \
A8 = _mm256_set1_epi64x( 0xBE0CAE40BE0CAE40 ); \
A9 = _mm256_set1_epi64x( 0x8BD144108BD14410 ); \
AA = _mm256_set1_epi64x( 0x76D2ADAC76D2ADAC ); \
AB = _mm256_set1_epi64x( 0x28ACAB7F28ACAB7F ); \
B0 = _mm256_set1_epi64x( 0xC1099CB7C1099CB7 ); \
B1 = _mm256_set1_epi64x( 0x07B385F307B385F3 ); \
B2 = _mm256_set1_epi64x( 0xE7442C26E7442C26 ); \
B3 = _mm256_set1_epi64x( 0xCC8AD640CC8AD640 ); \
B4 = _mm256_set1_epi64x( 0xEB6F56C7EB6F56C7 ); \
B5 = _mm256_set1_epi64x( 0x1EA81AA91EA81AA9 ); \
B6 = _mm256_set1_epi64x( 0x73B9D31473B9D314 ); \
B7 = _mm256_set1_epi64x( 0x1DE85D081DE85D08 ); \
B8 = _mm256_set1_epi64x( 0x48910A5A48910A5A ); \
B9 = _mm256_set1_epi64x( 0x893B22DB893B22DB ); \
BA = _mm256_set1_epi64x( 0xC5A0DF44C5A0DF44 ); \
BB = _mm256_set1_epi64x( 0xBBC4324EBBC4324E ); \
BC = _mm256_set1_epi64x( 0x72D2F24072D2F240 ); \
BD = _mm256_set1_epi64x( 0x75941D9975941D99 ); \
BE = _mm256_set1_epi64x( 0x6D8BDE826D8BDE82 ); \
BF = _mm256_set1_epi64x( 0xA1A7502BA1A7502B ); \
C0 = _mm256_set1_epi64x( 0xD9BF68D1D9BF68D1 ); \
C1 = _mm256_set1_epi64x( 0x58BAD75058BAD750 ); \
C2 = _mm256_set1_epi64x( 0x56028CB256028CB2 ); \
C3 = _mm256_set1_epi64x( 0x8134F3598134F359 ); \
C4 = _mm256_set1_epi64x( 0xB5D469D8B5D469D8 ); \
C5 = _mm256_set1_epi64x( 0x941A8CC2941A8CC2 ); \
C6 = _mm256_set1_epi64x( 0x418B2A6E418B2A6E ); \
C7 = _mm256_set1_epi64x( 0x0405278004052780 ); \
C8 = _mm256_set1_epi64x( 0x7F07D7877F07D787 ); \
C9 = _mm256_set1_epi64x( 0x5194358F5194358F ); \
CA = _mm256_set1_epi64x( 0x3C60D6653C60D665 ); \
CB = _mm256_set1_epi64x( 0xBE97D79ABE97D79A ); \
CC = _mm256_set1_epi64x( 0x950C3434950C3434 ); \
CD = _mm256_set1_epi64x( 0xAED9A06DAED9A06D ); \
CE = _mm256_set1_epi64x( 0x2537DC8D2537DC8D ); \
CF = _mm256_set1_epi64x( 0x7CDB59697CDB5969 ); \
} \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
@@ -303,7 +303,7 @@ do { \
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
@@ -443,52 +443,52 @@ shabal_8way_init( void *cc, unsigned size )
else
{ // No users
sc->state_loaded = true;
sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
sc->A[ 0] = _mm256_set1_epi64x( 0x52F8455252F84552 );
sc->A[ 1] = _mm256_set1_epi64x( 0xE54B7999E54B7999 );
sc->A[ 2] = _mm256_set1_epi64x( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = _mm256_set1_epi64x( 0xB9645191B9645191 );
sc->A[ 4] = _mm256_set1_epi64x( 0xE0078B86E0078B86 );
sc->A[ 5] = _mm256_set1_epi64x( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = _mm256_set1_epi64x( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = _mm256_set1_epi64x( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = _mm256_set1_epi64x( 0x14CE5A4514CE5A45 );
sc->A[ 9] = _mm256_set1_epi64x( 0x22AF50DC22AF50DC );
sc->A[10] = _mm256_set1_epi64x( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = _mm256_set1_epi64x( 0xEB21B74AEB21B74A );
sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
sc->B[ 0] = _mm256_set1_epi64x( 0xB555C6EEB555C6EE );
sc->B[ 1] = _mm256_set1_epi64x( 0x3E7105963E710596 );
sc->B[ 2] = _mm256_set1_epi64x( 0xA72A652FA72A652F );
sc->B[ 3] = _mm256_set1_epi64x( 0x9301515F9301515F );
sc->B[ 4] = _mm256_set1_epi64x( 0xDA28C1FADA28C1FA );
sc->B[ 5] = _mm256_set1_epi64x( 0x696FD868696FD868 );
sc->B[ 6] = _mm256_set1_epi64x( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = _mm256_set1_epi64x( 0x0AFE40020AFE4002 );
sc->B[ 8] = _mm256_set1_epi64x( 0xA6E03615A6E03615 );
sc->B[ 9] = _mm256_set1_epi64x( 0x5138C1D45138C1D4 );
sc->B[10] = _mm256_set1_epi64x( 0xBE216306BE216306 );
sc->B[11] = _mm256_set1_epi64x( 0xB38B8890B38B8890 );
sc->B[12] = _mm256_set1_epi64x( 0x3EA8B96B3EA8B96B );
sc->B[13] = _mm256_set1_epi64x( 0x3299ACE43299ACE4 );
sc->B[14] = _mm256_set1_epi64x( 0x30924DD430924DD4 );
sc->B[15] = _mm256_set1_epi64x( 0x55CB34A555CB34A5 );
sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
sc->C[ 0] = _mm256_set1_epi64x( 0xB405F031B405F031 );
sc->C[ 1] = _mm256_set1_epi64x( 0xC4233EBAC4233EBA );
sc->C[ 2] = _mm256_set1_epi64x( 0xB3733979B3733979 );
sc->C[ 3] = _mm256_set1_epi64x( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = _mm256_set1_epi64x( 0xC51C28AEC51C28AE );
sc->C[ 5] = _mm256_set1_epi64x( 0xA327B8E1A327B8E1 );
sc->C[ 6] = _mm256_set1_epi64x( 0x56C5616756C56167 );
sc->C[ 7] = _mm256_set1_epi64x( 0xED614433ED614433 );
sc->C[ 8] = _mm256_set1_epi64x( 0x88B59D6088B59D60 );
sc->C[ 9] = _mm256_set1_epi64x( 0x60E2CEBA60E2CEBA );
sc->C[10] = _mm256_set1_epi64x( 0x758B4B8B758B4B8B );
sc->C[11] = _mm256_set1_epi64x( 0x83E82A7F83E82A7F );
sc->C[12] = _mm256_set1_epi64x( 0xBC968828BC968828 );
sc->C[13] = _mm256_set1_epi64x( 0xE6E00BF7E6E00BF7 );
sc->C[14] = _mm256_set1_epi64x( 0xBA839E55BA839E55 );
sc->C[15] = _mm256_set1_epi64x( 0x9B491C609B491C60 );
}
sc->Wlow = 1;
sc->Whigh = 0;
@@ -707,50 +707,50 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
else \
{ \
(state)->state_loaded = true; \
A0 = m128_const1_64( 0x20728DFD20728DFD ); \
A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m128_const1_64( 0xE782B699E782B699 ); \
A3 = m128_const1_64( 0x5530463255304632 ); \
A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m128_const1_64( 0x8BD144108BD14410 ); \
AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m128_const1_64( 0x07B385F307B385F3 ); \
B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
B3 = m128_const1_64( 0xCC8AD640CC8AD640 ); \
B4 = m128_const1_64( 0xEB6F56C7EB6F56C7 ); \
B5 = m128_const1_64( 0x1EA81AA91EA81AA9 ); \
B6 = m128_const1_64( 0x73B9D31473B9D314 ); \
B7 = m128_const1_64( 0x1DE85D081DE85D08 ); \
B8 = m128_const1_64( 0x48910A5A48910A5A ); \
B9 = m128_const1_64( 0x893B22DB893B22DB ); \
BA = m128_const1_64( 0xC5A0DF44C5A0DF44 ); \
BB = m128_const1_64( 0xBBC4324EBBC4324E ); \
BC = m128_const1_64( 0x72D2F24072D2F240 ); \
BD = m128_const1_64( 0x75941D9975941D99 ); \
BE = m128_const1_64( 0x6D8BDE826D8BDE82 ); \
BF = m128_const1_64( 0xA1A7502BA1A7502B ); \
C0 = m128_const1_64( 0xD9BF68D1D9BF68D1 ); \
C1 = m128_const1_64( 0x58BAD75058BAD750 ); \
C2 = m128_const1_64( 0x56028CB256028CB2 ); \
C3 = m128_const1_64( 0x8134F3598134F359 ); \
C4 = m128_const1_64( 0xB5D469D8B5D469D8 ); \
C5 = m128_const1_64( 0x941A8CC2941A8CC2 ); \
C6 = m128_const1_64( 0x418B2A6E418B2A6E ); \
C7 = m128_const1_64( 0x0405278004052780 ); \
C8 = m128_const1_64( 0x7F07D7877F07D787 ); \
C9 = m128_const1_64( 0x5194358F5194358F ); \
CA = m128_const1_64( 0x3C60D6653C60D665 ); \
CB = m128_const1_64( 0xBE97D79ABE97D79A ); \
CC = m128_const1_64( 0x950C3434950C3434 ); \
CD = m128_const1_64( 0xAED9A06DAED9A06D ); \
CE = m128_const1_64( 0x2537DC8D2537DC8D ); \
CF = m128_const1_64( 0x7CDB59697CDB5969 ); \
A0 = _mm_set1_epi64x( 0x20728DFD20728DFD ); \
A1 = _mm_set1_epi64x( 0x46C0BD5346C0BD53 ); \
A2 = _mm_set1_epi64x( 0xE782B699E782B699 ); \
A3 = _mm_set1_epi64x( 0x5530463255304632 ); \
A4 = _mm_set1_epi64x( 0x71B4EF9071B4EF90 ); \
A5 = _mm_set1_epi64x( 0x0EA9E82C0EA9E82C ); \
A6 = _mm_set1_epi64x( 0xDBB930F1DBB930F1 ); \
A7 = _mm_set1_epi64x( 0xFAD06B8BFAD06B8B ); \
A8 = _mm_set1_epi64x( 0xBE0CAE40BE0CAE40 ); \
A9 = _mm_set1_epi64x( 0x8BD144108BD14410 ); \
AA = _mm_set1_epi64x( 0x76D2ADAC76D2ADAC ); \
AB = _mm_set1_epi64x( 0x28ACAB7F28ACAB7F ); \
B0 = _mm_set1_epi64x( 0xC1099CB7C1099CB7 ); \
B1 = _mm_set1_epi64x( 0x07B385F307B385F3 ); \
B2 = _mm_set1_epi64x( 0xE7442C26E7442C26 ); \
B3 = _mm_set1_epi64x( 0xCC8AD640CC8AD640 ); \
B4 = _mm_set1_epi64x( 0xEB6F56C7EB6F56C7 ); \
B5 = _mm_set1_epi64x( 0x1EA81AA91EA81AA9 ); \
B6 = _mm_set1_epi64x( 0x73B9D31473B9D314 ); \
B7 = _mm_set1_epi64x( 0x1DE85D081DE85D08 ); \
B8 = _mm_set1_epi64x( 0x48910A5A48910A5A ); \
B9 = _mm_set1_epi64x( 0x893B22DB893B22DB ); \
BA = _mm_set1_epi64x( 0xC5A0DF44C5A0DF44 ); \
BB = _mm_set1_epi64x( 0xBBC4324EBBC4324E ); \
BC = _mm_set1_epi64x( 0x72D2F24072D2F240 ); \
BD = _mm_set1_epi64x( 0x75941D9975941D99 ); \
BE = _mm_set1_epi64x( 0x6D8BDE826D8BDE82 ); \
BF = _mm_set1_epi64x( 0xA1A7502BA1A7502B ); \
C0 = _mm_set1_epi64x( 0xD9BF68D1D9BF68D1 ); \
C1 = _mm_set1_epi64x( 0x58BAD75058BAD750 ); \
C2 = _mm_set1_epi64x( 0x56028CB256028CB2 ); \
C3 = _mm_set1_epi64x( 0x8134F3598134F359 ); \
C4 = _mm_set1_epi64x( 0xB5D469D8B5D469D8 ); \
C5 = _mm_set1_epi64x( 0x941A8CC2941A8CC2 ); \
C6 = _mm_set1_epi64x( 0x418B2A6E418B2A6E ); \
C7 = _mm_set1_epi64x( 0x0405278004052780 ); \
C8 = _mm_set1_epi64x( 0x7F07D7877F07D787 ); \
C9 = _mm_set1_epi64x( 0x5194358F5194358F ); \
CA = _mm_set1_epi64x( 0x3C60D6653C60D665 ); \
CB = _mm_set1_epi64x( 0xBE97D79ABE97D79A ); \
CC = _mm_set1_epi64x( 0x950C3434950C3434 ); \
CD = _mm_set1_epi64x( 0xAED9A06DAED9A06D ); \
CE = _mm_set1_epi64x( 0x2537DC8D2537DC8D ); \
CF = _mm_set1_epi64x( 0x7CDB59697CDB5969 ); \
} \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
@@ -896,6 +896,16 @@ do { \
mm128_swap256_128( BF, CF ); \
} while (0)
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm128_xor3( xm, xb1, mm128_xorandnot( \
_mm_mullo_epi32( mm128_xor3( xa0, xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
xb0 = mm128_xnor( xa0, mm128_rol_32( xb0, 1 ) ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
@@ -905,6 +915,7 @@ do { \
) ), THREE ) ) ) ); \
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
} while (0)
*/
#define PERM_STEP_0 do { \
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
@@ -1078,103 +1089,103 @@ shabal_4way_init( void *cc, unsigned size )
{ // copy immediate constants directly to working registers later.
sc->state_loaded = false;
/*
sc->A[ 0] = m128_const1_64( 0x20728DFD20728DFD );
sc->A[ 1] = m128_const1_64( 0x46C0BD5346C0BD53 );
sc->A[ 2] = m128_const1_64( 0xE782B699E782B699 );
sc->A[ 3] = m128_const1_64( 0x5530463255304632 );
sc->A[ 4] = m128_const1_64( 0x71B4EF9071B4EF90 );
sc->A[ 5] = m128_const1_64( 0x0EA9E82C0EA9E82C );
sc->A[ 6] = m128_const1_64( 0xDBB930F1DBB930F1 );
sc->A[ 7] = m128_const1_64( 0xFAD06B8BFAD06B8B );
sc->A[ 8] = m128_const1_64( 0xBE0CAE40BE0CAE40 );
sc->A[ 9] = m128_const1_64( 0x8BD144108BD14410 );
sc->A[10] = m128_const1_64( 0x76D2ADAC76D2ADAC );
sc->A[11] = m128_const1_64( 0x28ACAB7F28ACAB7F );
sc->A[ 0] = _mm_set1_epi64x( 0x20728DFD20728DFD );
sc->A[ 1] = _mm_set1_epi64x( 0x46C0BD5346C0BD53 );
sc->A[ 2] = _mm_set1_epi64x( 0xE782B699E782B699 );
sc->A[ 3] = _mm_set1_epi64x( 0x5530463255304632 );
sc->A[ 4] = _mm_set1_epi64x( 0x71B4EF9071B4EF90 );
sc->A[ 5] = _mm_set1_epi64x( 0x0EA9E82C0EA9E82C );
sc->A[ 6] = _mm_set1_epi64x( 0xDBB930F1DBB930F1 );
sc->A[ 7] = _mm_set1_epi64x( 0xFAD06B8BFAD06B8B );
sc->A[ 8] = _mm_set1_epi64x( 0xBE0CAE40BE0CAE40 );
sc->A[ 9] = _mm_set1_epi64x( 0x8BD144108BD14410 );
sc->A[10] = _mm_set1_epi64x( 0x76D2ADAC76D2ADAC );
sc->A[11] = _mm_set1_epi64x( 0x28ACAB7F28ACAB7F );
sc->B[ 0] = m128_const1_64( 0xC1099CB7C1099CB7 );
sc->B[ 1] = m128_const1_64( 0x07B385F307B385F3 );
sc->B[ 2] = m128_const1_64( 0xE7442C26E7442C26 );
sc->B[ 3] = m128_const1_64( 0xCC8AD640CC8AD640 );
sc->B[ 4] = m128_const1_64( 0xEB6F56C7EB6F56C7 );
sc->B[ 5] = m128_const1_64( 0x1EA81AA91EA81AA9 );
sc->B[ 6] = m128_const1_64( 0x73B9D31473B9D314 );
sc->B[ 7] = m128_const1_64( 0x1DE85D081DE85D08 );
sc->B[ 8] = m128_const1_64( 0x48910A5A48910A5A );
sc->B[ 9] = m128_const1_64( 0x893B22DB893B22DB );
sc->B[10] = m128_const1_64( 0xC5A0DF44C5A0DF44 );
sc->B[11] = m128_const1_64( 0xBBC4324EBBC4324E );
sc->B[12] = m128_const1_64( 0x72D2F24072D2F240 );
sc->B[13] = m128_const1_64( 0x75941D9975941D99 );
sc->B[14] = m128_const1_64( 0x6D8BDE826D8BDE82 );
sc->B[15] = m128_const1_64( 0xA1A7502BA1A7502B );
sc->B[ 0] = _mm_set1_epi64x( 0xC1099CB7C1099CB7 );
sc->B[ 1] = _mm_set1_epi64x( 0x07B385F307B385F3 );
sc->B[ 2] = _mm_set1_epi64x( 0xE7442C26E7442C26 );
sc->B[ 3] = _mm_set1_epi64x( 0xCC8AD640CC8AD640 );
sc->B[ 4] = _mm_set1_epi64x( 0xEB6F56C7EB6F56C7 );
sc->B[ 5] = _mm_set1_epi64x( 0x1EA81AA91EA81AA9 );
sc->B[ 6] = _mm_set1_epi64x( 0x73B9D31473B9D314 );
sc->B[ 7] = _mm_set1_epi64x( 0x1DE85D081DE85D08 );
sc->B[ 8] = _mm_set1_epi64x( 0x48910A5A48910A5A );
sc->B[ 9] = _mm_set1_epi64x( 0x893B22DB893B22DB );
sc->B[10] = _mm_set1_epi64x( 0xC5A0DF44C5A0DF44 );
sc->B[11] = _mm_set1_epi64x( 0xBBC4324EBBC4324E );
sc->B[12] = _mm_set1_epi64x( 0x72D2F24072D2F240 );
sc->B[13] = _mm_set1_epi64x( 0x75941D9975941D99 );
sc->B[14] = _mm_set1_epi64x( 0x6D8BDE826D8BDE82 );
sc->B[15] = _mm_set1_epi64x( 0xA1A7502BA1A7502B );
sc->C[ 0] = m128_const1_64( 0xD9BF68D1D9BF68D1 );
sc->C[ 1] = m128_const1_64( 0x58BAD75058BAD750 );
sc->C[ 2] = m128_const1_64( 0x56028CB256028CB2 );
sc->C[ 3] = m128_const1_64( 0x8134F3598134F359 );
sc->C[ 4] = m128_const1_64( 0xB5D469D8B5D469D8 );
sc->C[ 5] = m128_const1_64( 0x941A8CC2941A8CC2 );
sc->C[ 6] = m128_const1_64( 0x418B2A6E418B2A6E );
sc->C[ 7] = m128_const1_64( 0x0405278004052780 );
sc->C[ 8] = m128_const1_64( 0x7F07D7877F07D787 );
sc->C[ 9] = m128_const1_64( 0x5194358F5194358F );
sc->C[10] = m128_const1_64( 0x3C60D6653C60D665 );
sc->C[11] = m128_const1_64( 0xBE97D79ABE97D79A );
sc->C[12] = m128_const1_64( 0x950C3434950C3434 );
sc->C[13] = m128_const1_64( 0xAED9A06DAED9A06D );
sc->C[14] = m128_const1_64( 0x2537DC8D2537DC8D );
sc->C[15] = m128_const1_64( 0x7CDB59697CDB5969 );
sc->C[ 0] = _mm_set1_epi64x( 0xD9BF68D1D9BF68D1 );
sc->C[ 1] = _mm_set1_epi64x( 0x58BAD75058BAD750 );
sc->C[ 2] = _mm_set1_epi64x( 0x56028CB256028CB2 );
sc->C[ 3] = _mm_set1_epi64x( 0x8134F3598134F359 );
sc->C[ 4] = _mm_set1_epi64x( 0xB5D469D8B5D469D8 );
sc->C[ 5] = _mm_set1_epi64x( 0x941A8CC2941A8CC2 );
sc->C[ 6] = _mm_set1_epi64x( 0x418B2A6E418B2A6E );
sc->C[ 7] = _mm_set1_epi64x( 0x0405278004052780 );
sc->C[ 8] = _mm_set1_epi64x( 0x7F07D7877F07D787 );
sc->C[ 9] = _mm_set1_epi64x( 0x5194358F5194358F );
sc->C[10] = _mm_set1_epi64x( 0x3C60D6653C60D665 );
sc->C[11] = _mm_set1_epi64x( 0xBE97D79ABE97D79A );
sc->C[12] = _mm_set1_epi64x( 0x950C3434950C3434 );
sc->C[13] = _mm_set1_epi64x( 0xAED9A06DAED9A06D );
sc->C[14] = _mm_set1_epi64x( 0x2537DC8D2537DC8D );
sc->C[15] = _mm_set1_epi64x( 0x7CDB59697CDB5969 );
*/
}
else
{ // No users
sc->state_loaded = true;
sc->A[ 0] = m128_const1_64( 0x52F8455252F84552 );
sc->A[ 1] = m128_const1_64( 0xE54B7999E54B7999 );
sc->A[ 2] = m128_const1_64( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = m128_const1_64( 0xB9645191B9645191 );
sc->A[ 4] = m128_const1_64( 0xE0078B86E0078B86 );
sc->A[ 5] = m128_const1_64( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = m128_const1_64( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = m128_const1_64( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = m128_const1_64( 0x14CE5A4514CE5A45 );
sc->A[ 9] = m128_const1_64( 0x22AF50DC22AF50DC );
sc->A[10] = m128_const1_64( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = m128_const1_64( 0xEB21B74AEB21B74A );
sc->A[ 0] = _mm_set1_epi64x( 0x52F8455252F84552 );
sc->A[ 1] = _mm_set1_epi64x( 0xE54B7999E54B7999 );
sc->A[ 2] = _mm_set1_epi64x( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = _mm_set1_epi64x( 0xB9645191B9645191 );
sc->A[ 4] = _mm_set1_epi64x( 0xE0078B86E0078B86 );
sc->A[ 5] = _mm_set1_epi64x( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = _mm_set1_epi64x( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = _mm_set1_epi64x( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = _mm_set1_epi64x( 0x14CE5A4514CE5A45 );
sc->A[ 9] = _mm_set1_epi64x( 0x22AF50DC22AF50DC );
sc->A[10] = _mm_set1_epi64x( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = _mm_set1_epi64x( 0xEB21B74AEB21B74A );
sc->B[ 0] = m128_const1_64( 0xB555C6EEB555C6EE );
sc->B[ 1] = m128_const1_64( 0x3E7105963E710596 );
sc->B[ 2] = m128_const1_64( 0xA72A652FA72A652F );
sc->B[ 3] = m128_const1_64( 0x9301515F9301515F );
sc->B[ 4] = m128_const1_64( 0xDA28C1FADA28C1FA );
sc->B[ 5] = m128_const1_64( 0x696FD868696FD868 );
sc->B[ 6] = m128_const1_64( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = m128_const1_64( 0x0AFE40020AFE4002 );
sc->B[ 8] = m128_const1_64( 0xA6E03615A6E03615 );
sc->B[ 9] = m128_const1_64( 0x5138C1D45138C1D4 );
sc->B[10] = m128_const1_64( 0xBE216306BE216306 );
sc->B[11] = m128_const1_64( 0xB38B8890B38B8890 );
sc->B[12] = m128_const1_64( 0x3EA8B96B3EA8B96B );
sc->B[13] = m128_const1_64( 0x3299ACE43299ACE4 );
sc->B[14] = m128_const1_64( 0x30924DD430924DD4 );
sc->B[15] = m128_const1_64( 0x55CB34A555CB34A5 );
sc->B[ 0] = _mm_set1_epi64x( 0xB555C6EEB555C6EE );
sc->B[ 1] = _mm_set1_epi64x( 0x3E7105963E710596 );
sc->B[ 2] = _mm_set1_epi64x( 0xA72A652FA72A652F );
sc->B[ 3] = _mm_set1_epi64x( 0x9301515F9301515F );
sc->B[ 4] = _mm_set1_epi64x( 0xDA28C1FADA28C1FA );
sc->B[ 5] = _mm_set1_epi64x( 0x696FD868696FD868 );
sc->B[ 6] = _mm_set1_epi64x( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = _mm_set1_epi64x( 0x0AFE40020AFE4002 );
sc->B[ 8] = _mm_set1_epi64x( 0xA6E03615A6E03615 );
sc->B[ 9] = _mm_set1_epi64x( 0x5138C1D45138C1D4 );
sc->B[10] = _mm_set1_epi64x( 0xBE216306BE216306 );
sc->B[11] = _mm_set1_epi64x( 0xB38B8890B38B8890 );
sc->B[12] = _mm_set1_epi64x( 0x3EA8B96B3EA8B96B );
sc->B[13] = _mm_set1_epi64x( 0x3299ACE43299ACE4 );
sc->B[14] = _mm_set1_epi64x( 0x30924DD430924DD4 );
sc->B[15] = _mm_set1_epi64x( 0x55CB34A555CB34A5 );
sc->C[ 0] = m128_const1_64( 0xB405F031B405F031 );
sc->C[ 1] = m128_const1_64( 0xC4233EBAC4233EBA );
sc->C[ 2] = m128_const1_64( 0xB3733979B3733979 );
sc->C[ 3] = m128_const1_64( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = m128_const1_64( 0xC51C28AEC51C28AE );
sc->C[ 5] = m128_const1_64( 0xA327B8E1A327B8E1 );
sc->C[ 6] = m128_const1_64( 0x56C5616756C56167 );
sc->C[ 7] = m128_const1_64( 0xED614433ED614433 );
sc->C[ 8] = m128_const1_64( 0x88B59D6088B59D60 );
sc->C[ 9] = m128_const1_64( 0x60E2CEBA60E2CEBA );
sc->C[10] = m128_const1_64( 0x758B4B8B758B4B8B );
sc->C[11] = m128_const1_64( 0x83E82A7F83E82A7F );
sc->C[12] = m128_const1_64( 0xBC968828BC968828 );
sc->C[13] = m128_const1_64( 0xE6E00BF7E6E00BF7 );
sc->C[14] = m128_const1_64( 0xBA839E55BA839E55 );
sc->C[15] = m128_const1_64( 0x9B491C609B491C60 );
sc->C[ 0] = _mm_set1_epi64x( 0xB405F031B405F031 );
sc->C[ 1] = _mm_set1_epi64x( 0xC4233EBAC4233EBA );
sc->C[ 2] = _mm_set1_epi64x( 0xB3733979B3733979 );
sc->C[ 3] = _mm_set1_epi64x( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = _mm_set1_epi64x( 0xC51C28AEC51C28AE );
sc->C[ 5] = _mm_set1_epi64x( 0xA327B8E1A327B8E1 );
sc->C[ 6] = _mm_set1_epi64x( 0x56C5616756C56167 );
sc->C[ 7] = _mm_set1_epi64x( 0xED614433ED614433 );
sc->C[ 8] = _mm_set1_epi64x( 0x88B59D6088B59D60 );
sc->C[ 9] = _mm_set1_epi64x( 0x60E2CEBA60E2CEBA );
sc->C[10] = _mm_set1_epi64x( 0x758B4B8B758B4B8B );
sc->C[11] = _mm_set1_epi64x( 0x83E82A7F83E82A7F );
sc->C[12] = _mm_set1_epi64x( 0xBC968828BC968828 );
sc->C[13] = _mm_set1_epi64x( 0xE6E00BF7E6E00BF7 );
sc->C[14] = _mm_set1_epi64x( 0xBA839E55BA839E55 );
sc->C[15] = _mm_set1_epi64x( 0x9B491C609B491C60 );
}
sc->Wlow = 1;
sc->Whigh = 0;

View File

@@ -32,6 +32,44 @@ static const uint32_t IV512[] =
#endif
#if defined (__AVX512VL__)
//TODO Enable for AVX10_256
#define DECL_m256i_count \
const __m256i count = \
mm256_set4_32( ctx->count3, ctx->count2, ctx->count1, ctx->count0 );
#define COUNT_R0 \
_mm256_mask_xor_epi32( count, 0x88, count, m256_neg1 )
#define COUNT_R1 \
mm256_shuflr128_32( _mm256_mask_xor_epi32( count, 0x11, count, m256_neg1 ) )
#define COUNT_R2 \
mm256_swap128_64( _mm256_mask_xor_epi32( count, 0x22, count, m256_neg1 ) )
#define COUNT_R13 \
mm256_swap64_32( _mm256_mask_xor_epi32( count, 0x44, count, m256_neg1 ) )
#else
#define DECL_m256i_count
// R matches the loop index not the round number, should changet that
#define COUNT_R0 \
mm256_set4_32( ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 )
#define COUNT_R1 \
mm256_set4_32( ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 )
#define COUNT_R2 \
mm256_set4_32( ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 )
#define COUNT_R13 \
mm256_set4_32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 )
#endif
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
@@ -40,6 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
__m256i *h = (__m256i*)ctx->h;
DECL_m256i_count;
int r;
p0 = h[0];
@@ -47,7 +86,8 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
p2 = h[2];
p3 = h[3];
// round
// round 0
k00 = m[0];
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
k01 = m[1];
@@ -78,18 +118,14 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
mm256_aesenc_2x128( k00, zero ) ) );
if ( r == 0 )
k00 = _mm256_xor_si256( k00, _mm256_set_epi32(
~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
k00 = _mm256_xor_si256( k00, COUNT_R0 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( k00,
mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
k01 = _mm256_xor_si256( k01, COUNT_R1 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k01,
@@ -114,9 +150,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
k13 = _mm256_xor_si256( k13, COUNT_R2 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p1 = _mm256_xor_si256( p1, x );
@@ -228,9 +262,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, COUNT_R13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_shuflr128_32(

View File

@@ -204,11 +204,9 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, mm512_swap64_32(
_mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7= _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );

View File

@@ -212,14 +212,24 @@ do { \
// targetted
#define shufxor2w(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, m256_const1_64( \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
_mm256_srai_epi16( x, 8 ) )
#else
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
#endif
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
m256_const1_64( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, m256_const1_64( 0x0080008000800080 ) ) ) )
_mm256_set1_epi64x( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -387,17 +397,11 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
_mm512_srai_epi16( x, 8 ) )
/*
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
*/
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
m512_const1_64( 0x0101010101010101 ), \
_mm512_set1_epi64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, m512_const1_64( 0x0080008000800080 ) ) ) ) )
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
// generic, except it calls targetted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

View File

@@ -63,7 +63,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
@@ -151,7 +151,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

View File

@@ -285,7 +285,7 @@ static const uint64_t IV512[] = {
#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
#define READ_STATE_BIG(sc) do { \
#define READ_STATE_BIG(sc) \
h0 = (sc)->h0; \
h1 = (sc)->h1; \
h2 = (sc)->h2; \
@@ -294,10 +294,9 @@ static const uint64_t IV512[] = {
h5 = (sc)->h5; \
h6 = (sc)->h6; \
h7 = (sc)->h7; \
bcount = sc->bcount; \
} while (0)
bcount = sc->bcount;
#define WRITE_STATE_BIG(sc) do { \
#define WRITE_STATE_BIG(sc) \
(sc)->h0 = h0; \
(sc)->h1 = h1; \
(sc)->h2 = h2; \
@@ -306,62 +305,54 @@ static const uint64_t IV512[] = {
(sc)->h5 = h5; \
(sc)->h6 = h6; \
(sc)->h7 = h7; \
sc->bcount = bcount; \
} while (0)
sc->bcount = bcount;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \
k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \
mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\
t2 = t0 ^ t1; \
} while (0)
k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), \
mm512_xor3( k3, k4, k5 ), \
mm512_xor3( k6, k7, \
_mm512_set1_epi64( 0x1BD11BDAA9FC1A22) ) ); \
t2 = t0 ^ t1;
#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
do { \
w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
w1 = _mm512_add_epi64( w1, SKBI(k,s,1) ); \
w2 = _mm512_add_epi64( w2, SKBI(k,s,2) ); \
w3 = _mm512_add_epi64( w3, SKBI(k,s,3) ); \
w4 = _mm512_add_epi64( w4, SKBI(k,s,4) ); \
w5 = _mm512_add_epi64( w5, _mm512_add_epi64( SKBI(k,s,5), \
m512_const1_64( SKBT(t,s,0) ) ) ); \
_mm512_set1_epi64( SKBT(t,s,0) ) ) ); \
w6 = _mm512_add_epi64( w6, _mm512_add_epi64( SKBI(k,s,6), \
m512_const1_64( SKBT(t,s,1) ) ) ); \
_mm512_set1_epi64( SKBT(t,s,1) ) ) ); \
w7 = _mm512_add_epi64( w7, _mm512_add_epi64( SKBI(k,s,7), \
m512_const1_64( s ) ) ); \
} while (0)
_mm512_set1_epi64( s ) ) );
#define TFBIG_MIX_8WAY(x0, x1, rc) \
do { \
x0 = _mm512_add_epi64( x0, x1 ); \
x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 ); \
} while (0)
x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 );
#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) \
TFBIG_MIX_8WAY(w0, w1, rc0); \
TFBIG_MIX_8WAY(w2, w3, rc1); \
TFBIG_MIX_8WAY(w4, w5, rc2); \
TFBIG_MIX_8WAY(w6, w7, rc3); \
} while (0)
TFBIG_MIX_8WAY(w6, w7, rc3);
#define TFBIG_8WAY_4e(s) do { \
#define TFBIG_8WAY_4e(s) \
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
} while (0)
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
#define TFBIG_8WAY_4o(s) do { \
#define TFBIG_8WAY_4o(s) \
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
} while (0)
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
#define UBI_BIG_8WAY(etype, extra) \
do { \
@@ -424,59 +415,48 @@ do { \
#endif // AVX512
#define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \
k8 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( _mm256_xor_si256( k0, k1 ), \
_mm256_xor_si256( k2, k3 ) ), \
_mm256_xor_si256( _mm256_xor_si256( k4, k5 ), \
_mm256_xor_si256( k6, k7 ) ) ), \
m256_const1_64( 0x1BD11BDAA9FC1A22) ); \
t2 = t0 ^ t1; \
} while (0)
k8 = mm256_xor3( mm256_xor3( k0, k1, k2 ), \
mm256_xor3( k3, k4, k5 ), \
mm256_xor3( k6, k7, \
_mm256_set1_epi64x( 0x1BD11BDAA9FC1A22) ) ); \
t2 = t0 ^ t1;
#define TFBIG_ADDKEY_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
do { \
w0 = _mm256_add_epi64( w0, SKBI(k,s,0) ); \
w1 = _mm256_add_epi64( w1, SKBI(k,s,1) ); \
w2 = _mm256_add_epi64( w2, SKBI(k,s,2) ); \
w3 = _mm256_add_epi64( w3, SKBI(k,s,3) ); \
w4 = _mm256_add_epi64( w4, SKBI(k,s,4) ); \
w5 = _mm256_add_epi64( w5, _mm256_add_epi64( SKBI(k,s,5), \
m256_const1_64( SKBT(t,s,0) ) ) ); \
_mm256_set1_epi64x( SKBT(t,s,0) ) ) ); \
w6 = _mm256_add_epi64( w6, _mm256_add_epi64( SKBI(k,s,6), \
m256_const1_64( SKBT(t,s,1) ) ) ); \
_mm256_set1_epi64x( SKBT(t,s,1) ) ) ); \
w7 = _mm256_add_epi64( w7, _mm256_add_epi64( SKBI(k,s,7), \
m256_const1_64( s ) ) ); \
} while (0)
_mm256_set1_epi64x( s ) ) );
#define TFBIG_MIX_4WAY(x0, x1, rc) \
do { \
x0 = _mm256_add_epi64( x0, x1 ); \
x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \
} while (0)
x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 );
#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) \
TFBIG_MIX_4WAY(w0, w1, rc0); \
TFBIG_MIX_4WAY(w2, w3, rc1); \
TFBIG_MIX_4WAY(w4, w5, rc2); \
TFBIG_MIX_4WAY(w6, w7, rc3); \
} while (0)
TFBIG_MIX_4WAY(w6, w7, rc3);
#define TFBIG_4WAY_4e(s) do { \
#define TFBIG_4WAY_4e(s) \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
} while (0)
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
#define TFBIG_4WAY_4o(s) do { \
#define TFBIG_4WAY_4o(s) \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
} while (0)
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
// scale buf offset by 4
#define UBI_BIG_4WAY(etype, extra) \
@@ -541,28 +521,28 @@ do { \
void skein256_8way_init( skein256_8way_context *sc )
{
sc->h0 = m512_const1_64( 0xCCD044A12FDB3E13 );
sc->h1 = m512_const1_64( 0xE83590301A79A9EB );
sc->h2 = m512_const1_64( 0x55AEA0614F816E6F );
sc->h3 = m512_const1_64( 0x2A2767A4AE9B94DB );
sc->h4 = m512_const1_64( 0xEC06025E74DD7683 );
sc->h5 = m512_const1_64( 0xE7A436CDC4746251 );
sc->h6 = m512_const1_64( 0xC36FBAF9393AD185 );
sc->h7 = m512_const1_64( 0x3EEDBA1833EDFC13 );
sc->h0 = _mm512_set1_epi64( 0xCCD044A12FDB3E13 );
sc->h1 = _mm512_set1_epi64( 0xE83590301A79A9EB );
sc->h2 = _mm512_set1_epi64( 0x55AEA0614F816E6F );
sc->h3 = _mm512_set1_epi64( 0x2A2767A4AE9B94DB );
sc->h4 = _mm512_set1_epi64( 0xEC06025E74DD7683 );
sc->h5 = _mm512_set1_epi64( 0xE7A436CDC4746251 );
sc->h6 = _mm512_set1_epi64( 0xC36FBAF9393AD185 );
sc->h7 = _mm512_set1_epi64( 0x3EEDBA1833EDFC13 );
sc->bcount = 0;
sc->ptr = 0;
}
void skein512_8way_init( skein512_8way_context *sc )
{
sc->h0 = m512_const1_64( 0x4903ADFF749C51CE );
sc->h1 = m512_const1_64( 0x0D95DE399746DF03 );
sc->h2 = m512_const1_64( 0x8FD1934127C79BCE );
sc->h3 = m512_const1_64( 0x9A255629FF352CB1 );
sc->h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
sc->h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
sc->h6 = m512_const1_64( 0x991112C71A75B523 );
sc->h7 = m512_const1_64( 0xAE18A40B660FCC33 );
sc->h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
sc->h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
sc->h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
sc->h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
sc->h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
sc->h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
sc->h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
sc->h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );
sc->bcount = 0;
sc->ptr = 0;
}
@@ -660,14 +640,14 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
// Init
h0 = m512_const1_64( 0x4903ADFF749C51CE );
h1 = m512_const1_64( 0x0D95DE399746DF03 );
h2 = m512_const1_64( 0x8FD1934127C79BCE );
h3 = m512_const1_64( 0x9A255629FF352CB1 );
h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
h6 = m512_const1_64( 0x991112C71A75B523 );
h7 = m512_const1_64( 0xAE18A40B660FCC33 );
h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );
// Update
@@ -734,14 +714,14 @@ skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
buf[5] = vdata[5];
buf[6] = vdata[6];
buf[7] = vdata[7];
register __m512i h0 = m512_const1_64( 0x4903ADFF749C51CE );
register __m512i h1 = m512_const1_64( 0x0D95DE399746DF03 );
register __m512i h2 = m512_const1_64( 0x8FD1934127C79BCE );
register __m512i h3 = m512_const1_64( 0x9A255629FF352CB1 );
register __m512i h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
register __m512i h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
register __m512i h6 = m512_const1_64( 0x991112C71A75B523 );
register __m512i h7 = m512_const1_64( 0xAE18A40B660FCC33 );
register __m512i h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
register __m512i h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
register __m512i h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
register __m512i h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
register __m512i h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
register __m512i h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
register __m512i h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
register __m512i h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );
uint64_t bcount = 1;
UBI_BIG_8WAY( 224, 0 );
@@ -830,28 +810,28 @@ skein512_8way_close(void *cc, void *dst)
void skein256_4way_init( skein256_4way_context *sc )
{
sc->h0 = m256_const1_64( 0xCCD044A12FDB3E13 );
sc->h1 = m256_const1_64( 0xE83590301A79A9EB );
sc->h2 = m256_const1_64( 0x55AEA0614F816E6F );
sc->h3 = m256_const1_64( 0x2A2767A4AE9B94DB );
sc->h4 = m256_const1_64( 0xEC06025E74DD7683 );
sc->h5 = m256_const1_64( 0xE7A436CDC4746251 );
sc->h6 = m256_const1_64( 0xC36FBAF9393AD185 );
sc->h7 = m256_const1_64( 0x3EEDBA1833EDFC13 );
sc->h0 = _mm256_set1_epi64x( 0xCCD044A12FDB3E13 );
sc->h1 = _mm256_set1_epi64x( 0xE83590301A79A9EB );
sc->h2 = _mm256_set1_epi64x( 0x55AEA0614F816E6F );
sc->h3 = _mm256_set1_epi64x( 0x2A2767A4AE9B94DB );
sc->h4 = _mm256_set1_epi64x( 0xEC06025E74DD7683 );
sc->h5 = _mm256_set1_epi64x( 0xE7A436CDC4746251 );
sc->h6 = _mm256_set1_epi64x( 0xC36FBAF9393AD185 );
sc->h7 = _mm256_set1_epi64x( 0x3EEDBA1833EDFC13 );
sc->bcount = 0;
sc->ptr = 0;
}
void skein512_4way_init( skein512_4way_context *sc )
{
sc->h0 = m256_const1_64( 0x4903ADFF749C51CE );
sc->h1 = m256_const1_64( 0x0D95DE399746DF03 );
sc->h2 = m256_const1_64( 0x8FD1934127C79BCE );
sc->h3 = m256_const1_64( 0x9A255629FF352CB1 );
sc->h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
sc->h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
sc->h6 = m256_const1_64( 0x991112C71A75B523 );
sc->h7 = m256_const1_64( 0xAE18A40B660FCC33 );
sc->h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
sc->h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
sc->h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
sc->h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
sc->h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
sc->h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
sc->h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
sc->h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );
sc->bcount = 0;
sc->ptr = 0;
}
@@ -954,14 +934,14 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
const int buf_size = 64; // 64 * __m256i
uint64_t bcount = 0;
h0 = m256_const1_64( 0x4903ADFF749C51CE );
h1 = m256_const1_64( 0x0D95DE399746DF03 );
h2 = m256_const1_64( 0x8FD1934127C79BCE );
h3 = m256_const1_64( 0x9A255629FF352CB1 );
h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
h6 = m256_const1_64( 0x991112C71A75B523 );
h7 = m256_const1_64( 0xAE18A40B660FCC33 );
h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );
// Update
@@ -1028,14 +1008,14 @@ skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
buf[5] = vdata[5];
buf[6] = vdata[6];
buf[7] = vdata[7];
register __m256i h0 = m256_const1_64( 0x4903ADFF749C51CE );
register __m256i h1 = m256_const1_64( 0x0D95DE399746DF03 );
register __m256i h2 = m256_const1_64( 0x8FD1934127C79BCE );
register __m256i h3 = m256_const1_64( 0x9A255629FF352CB1 );
register __m256i h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
register __m256i h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
register __m256i h6 = m256_const1_64( 0x991112C71A75B523 );
register __m256i h7 = m256_const1_64( 0xAE18A40B660FCC33 );
register __m256i h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
register __m256i h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
register __m256i h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
register __m256i h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
register __m256i h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
register __m256i h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
register __m256i h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
register __m256i h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );
uint64_t bcount = 1;
UBI_BIG_4WAY( 224, 0 );

View File

@@ -57,7 +57,7 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
@@ -119,7 +119,7 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );

View File

@@ -630,36 +630,35 @@ void InitializeSWIFFTX()
}
// In the original code the F matrix is rotated so it was not aranged
// the same as all the other data. Rearanging F to match all the other
// data made vectorizing possible, the compiler probably could have been
// able to auto-vectorize with proper data organisation.
// Also in the original code the custom 16 bit data types are all now 32
// bit int32_t regardless of the type name.
//
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
// the same as the other data. Rearanging F made vectorizing up to 256 bits
// possible.
// Also in the original code the custom 16 bit data types are all now aliased
// to 32 bit int32_t.
void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
{
#if defined(__AVX2__)
__m256i F[8] __attribute__ ((aligned (64)));
__m256i F0, F1, F2, F3, F4, F5, F6, F7;
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
__m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output;
__m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );
F[0] = _mm256_mullo_epi32( mul[0], *tbl );
tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
F[1] = _mm256_mullo_epi32( mul[1], *tbl );
tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
F[2] = _mm256_mullo_epi32( mul[2], *tbl );
tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
F[3] = _mm256_mullo_epi32( mul[3], *tbl );
tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
F[4] = _mm256_mullo_epi32( mul[4], *tbl );
tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
F[5] = _mm256_mullo_epi32( mul[5], *tbl );
tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
F[6] = _mm256_mullo_epi32( mul[6], *tbl );
tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
F[7] = _mm256_mullo_epi32( mul[7], *tbl );
F0 = _mm256_mullo_epi32( mul[0], tbl );
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
F1 = _mm256_mullo_epi32( mul[1], tbl );
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
F2 = _mm256_mullo_epi32( mul[2], tbl );
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
F3 = _mm256_mullo_epi32( mul[3], tbl );
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
F4 = _mm256_mullo_epi32( mul[4], tbl );
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
F5 = _mm256_mullo_epi32( mul[5], tbl );
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
F6 = _mm256_mullo_epi32( mul[6], tbl );
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
F7 = _mm256_mullo_epi32( mul[7], tbl );
#define ADD_SUB( a, b ) \
{ \
@@ -668,52 +667,50 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
a = _mm256_add_epi32( a, tmp ); \
}
ADD_SUB( F[0], F[1] );
ADD_SUB( F[2], F[3] );
ADD_SUB( F[4], F[5] );
ADD_SUB( F[6], F[7] );
F[3] = _mm256_slli_epi32( F[3], 4 );
F[7] = _mm256_slli_epi32( F[7], 4 );
ADD_SUB( F[0], F[2] );
ADD_SUB( F[1], F[3] );
ADD_SUB( F[4], F[6] );
ADD_SUB( F[5], F[7] );
F[5] = _mm256_slli_epi32( F[5], 2 );
F[6] = _mm256_slli_epi32( F[6], 4 );
F[7] = _mm256_slli_epi32( F[7], 6 );
ADD_SUB( F[0], F[4] );
ADD_SUB( F[1], F[5] );
ADD_SUB( F[2], F[6] );
ADD_SUB( F[3], F[7] );
ADD_SUB( F0, F1 );
ADD_SUB( F2, F3 );
ADD_SUB( F4, F5 );
ADD_SUB( F6, F7 );
F3 = _mm256_slli_epi32( F3, 4 );
F7 = _mm256_slli_epi32( F7, 4 );
ADD_SUB( F0, F2 );
ADD_SUB( F1, F3 );
ADD_SUB( F4, F6 );
ADD_SUB( F5, F7 );
F5 = _mm256_slli_epi32( F5, 2 );
F6 = _mm256_slli_epi32( F6, 4 );
F7 = _mm256_slli_epi32( F7, 6 );
ADD_SUB( F0, F4 );
ADD_SUB( F1, F5 );
ADD_SUB( F2, F6 );
ADD_SUB( F3, F7 );
#undef ADD_SUB
#if defined (__AVX512VL__) && defined(__AVX512BW__)
const __m256i mask = _mm256_movm_epi8( 0x11111111 );
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_maskz_mov_epi8( 0x11111111, a ), \
_mm256_srai_epi32( a, 8 ) )
#else
const __m256i mask = m256_const1_32( 0x000000ff );
#endif
const __m256i mask = _mm256_set1_epi32( 0x000000ff );
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_and_si256( a, mask ), \
_mm256_srai_epi32( a, 8 ) )
#endif
out[0] = Q_REDUCE( F[0] );
out[1] = Q_REDUCE( F[1] );
out[2] = Q_REDUCE( F[2] );
out[3] = Q_REDUCE( F[3] );
out[4] = Q_REDUCE( F[4] );
out[5] = Q_REDUCE( F[5] );
out[6] = Q_REDUCE( F[6] );
out[7] = Q_REDUCE( F[7] );
out[0] = Q_REDUCE( F0 );
out[1] = Q_REDUCE( F1 );
out[2] = Q_REDUCE( F2 );
out[3] = Q_REDUCE( F3 );
out[4] = Q_REDUCE( F4 );
out[5] = Q_REDUCE( F5 );
out[6] = Q_REDUCE( F6 );
out[7] = Q_REDUCE( F7 );
#undef Q_REDUCE
@@ -763,12 +760,10 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
ADD_SUB( F[ 9], F[11] );
ADD_SUB( F[12], F[14] );
ADD_SUB( F[13], F[15] );
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
F[14] = _mm_slli_epi32( F[14], 4 );
F[15] = _mm_slli_epi32( F[15], 4 );
ADD_SUB( F[ 0], F[ 4] );
ADD_SUB( F[ 1], F[ 5] );
ADD_SUB( F[ 2], F[ 6] );
@@ -777,14 +772,12 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
ADD_SUB( F[ 9], F[13] );
ADD_SUB( F[10], F[14] );
ADD_SUB( F[11], F[15] );
F[10] = _mm_slli_epi32( F[10], 2 );
F[11] = _mm_slli_epi32( F[11], 2 );
F[12] = _mm_slli_epi32( F[12], 4 );
F[13] = _mm_slli_epi32( F[13], 4 );
F[14] = _mm_slli_epi32( F[14], 6 );
F[15] = _mm_slli_epi32( F[15], 6 );
ADD_SUB( F[ 0], F[ 8] );
ADD_SUB( F[ 1], F[ 9] );
ADD_SUB( F[ 2], F[10] );
@@ -796,7 +789,7 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
#undef ADD_SUB
const __m128i mask = m128_const1_32( 0x000000ff );
const __m128i mask = _mm_set1_epi32( 0x000000ff );
#define Q_REDUCE( a ) \
_mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) )
@@ -820,16 +813,13 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
#undef Q_REDUCE
#else // < SSE4.1
#else // AVX256 elif SSE4_1
swift_int16_t *mult = multipliers;
// First loop unrolling:
register swift_int16_t *table = &(fftTable[input[0] << 3]);
/*
swift_int16_t *table = &( fftTable[ input[0] << 3 ] );
swift_int32_t F[64];
/*
for (int i = 0; i < 8; i++)
{
int j = i<<3;
@@ -845,99 +835,91 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
}
*/
register swift_int32_t F0, F1, F2, F3, F4, F5, F6, F7, F8, F9,
F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
F20, F21, F22, F23, F24, F25, F26, F27, F28, F29,
F30, F31, F32, F33, F34, F35, F36, F37, F38, F39,
F40, F41, F42, F43, F44, F45, F46, F47, F48, F49,
F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
F60, F61, F62, F63;
F0 = mult[0] * table[0];
F8 = mult[1] * table[1];
F16 = mult[2] * table[2];
F24 = mult[3] * table[3];
F32 = mult[4] * table[4];
F40 = mult[5] * table[5];
F48 = mult[6] * table[6];
F56 = mult[7] * table[7];
F[ 0] = mult[ 0] * table[0];
F[ 8] = mult[ 1] * table[1];
F[16] = mult[ 2] * table[2];
F[24] = mult[ 3] * table[3];
F[32] = mult[ 4] * table[4];
F[40] = mult[ 5] * table[5];
F[48] = mult[ 6] * table[6];
F[56] = mult[ 7] * table[7];
table = &(fftTable[input[1] << 3]);
F1 = mult[ 8] * table[0];
F9 = mult[ 9] * table[1];
F17 = mult[10] * table[2];
F25 = mult[11] * table[3];
F33 = mult[12] * table[4];
F41 = mult[13] * table[5];
F49 = mult[14] * table[6];
F57 = mult[15] * table[7];
F[ 1] = mult[ 8] * table[0];
F[ 9] = mult[ 9] * table[1];
F[17] = mult[10] * table[2];
F[25] = mult[11] * table[3];
F[33] = mult[12] * table[4];
F[41] = mult[13] * table[5];
F[49] = mult[14] * table[6];
F[57] = mult[15] * table[7];
table = &(fftTable[input[2] << 3]);
F2 = mult[16] * table[0];
F10 = mult[17] * table[1];
F18 = mult[18] * table[2];
F26 = mult[19] * table[3];
F34 = mult[20] * table[4];
F42 = mult[21] * table[5];
F50 = mult[22] * table[6];
F58 = mult[23] * table[7];
F[ 2] = mult[16] * table[0];
F[10] = mult[17] * table[1];
F[18] = mult[18] * table[2];
F[26] = mult[19] * table[3];
F[34] = mult[20] * table[4];
F[42] = mult[21] * table[5];
F[50] = mult[22] * table[6];
F[58] = mult[23] * table[7];
table = &(fftTable[input[3] << 3]);
F3 = mult[24] * table[0];
F11 = mult[25] * table[1];
F19 = mult[26] * table[2];
F27 = mult[27] * table[3];
F35 = mult[28] * table[4];
F43 = mult[29] * table[5];
F51 = mult[30] * table[6];
F59 = mult[31] * table[7];
F[ 3] = mult[24] * table[0];
F[11] = mult[25] * table[1];
F[19] = mult[26] * table[2];
F[27] = mult[27] * table[3];
F[35] = mult[28] * table[4];
F[43] = mult[29] * table[5];
F[51] = mult[30] * table[6];
F[59] = mult[31] * table[7];
table = &(fftTable[input[4] << 3]);
F4 = mult[32] * table[0];
F12 = mult[33] * table[1];
F20 = mult[34] * table[2];
F28 = mult[35] * table[3];
F36 = mult[36] * table[4];
F44 = mult[37] * table[5];
F52 = mult[38] * table[6];
F60 = mult[39] * table[7];
F[ 4] = mult[32] * table[0];
F[12] = mult[33] * table[1];
F[20] = mult[34] * table[2];
F[28] = mult[35] * table[3];
F[36] = mult[36] * table[4];
F[44] = mult[37] * table[5];
F[52] = mult[38] * table[6];
F[60] = mult[39] * table[7];
table = &(fftTable[input[5] << 3]);
F5 = mult[40] * table[0];
F13 = mult[41] * table[1];
F21 = mult[42] * table[2];
F29 = mult[43] * table[3];
F37 = mult[44] * table[4];
F45 = mult[45] * table[5];
F53 = mult[46] * table[6];
F61 = mult[47] * table[7];
F[ 5] = mult[40] * table[0];
F[13] = mult[41] * table[1];
F[21] = mult[42] * table[2];
F[29] = mult[43] * table[3];
F[37] = mult[44] * table[4];
F[45] = mult[45] * table[5];
F[53] = mult[46] * table[6];
F[61] = mult[47] * table[7];
table = &(fftTable[input[6] << 3]);
F6 = mult[48] * table[0];
F14 = mult[49] * table[1];
F22 = mult[50] * table[2];
F30 = mult[51] * table[3];
F38 = mult[52] * table[4];
F46 = mult[53] * table[5];
F54 = mult[54] * table[6];
F62 = mult[55] * table[7];
F[ 6] = mult[48] * table[0];
F[14] = mult[49] * table[1];
F[22] = mult[50] * table[2];
F[30] = mult[51] * table[3];
F[38] = mult[52] * table[4];
F[46] = mult[53] * table[5];
F[54] = mult[54] * table[6];
F[62] = mult[55] * table[7];
table = &(fftTable[input[7] << 3]);
F7 = mult[56] * table[0];
F15 = mult[57] * table[1];
F23 = mult[58] * table[2];
F31 = mult[59] * table[3];
F39 = mult[60] * table[4];
F47 = mult[61] * table[5];
F55 = mult[62] * table[6];
F63 = mult[63] * table[7];
F[ 7] = mult[56] * table[0];
F[15] = mult[57] * table[1];
F[23] = mult[58] * table[2];
F[31] = mult[59] * table[3];
F[39] = mult[60] * table[4];
F[47] = mult[61] * table[5];
F[55] = mult[62] * table[6];
F[63] = mult[63] * table[7];
#define ADD_SUB( a, b ) \
{ \
@@ -987,262 +969,229 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
}
*/
// Second loop unrolling:
// Iteration 0:
ADD_SUB(F0, F1);
ADD_SUB(F2, F3);
ADD_SUB(F4, F5);
ADD_SUB(F6, F7);
ADD_SUB( F[ 0], F[ 1] );
ADD_SUB( F[ 2], F[ 3] );
ADD_SUB( F[ 4], F[ 5] );
ADD_SUB( F[ 6], F[ 7] );
F[ 3] <<= 4;
F[ 7] <<= 4;
ADD_SUB( F[ 0], F[ 2] );
ADD_SUB( F[ 1], F[ 3] );
ADD_SUB( F[ 4], F[ 6] );
ADD_SUB( F[ 5], F[ 7] );
F[ 5] <<= 2;
F[ 6] <<= 4;
F[ 7] <<= 6;
ADD_SUB( F[ 0], F[ 4] );
ADD_SUB( F[ 1], F[ 5] );
ADD_SUB( F[ 2], F[ 6] );
ADD_SUB( F[ 3], F[ 7] );
F3 <<= 4;
F7 <<= 4;
ADD_SUB(F0, F2);
ADD_SUB(F1, F3);
ADD_SUB(F4, F6);
ADD_SUB(F5, F7);
F5 <<= 2;
F6 <<= 4;
F7 <<= 6;
ADD_SUB(F0, F4);
ADD_SUB(F1, F5);
ADD_SUB(F2, F6);
ADD_SUB(F3, F7);
output[0] = Q_REDUCE(F0);
output[8] = Q_REDUCE(F1);
output[16] = Q_REDUCE(F2);
output[24] = Q_REDUCE(F3);
output[32] = Q_REDUCE(F4);
output[40] = Q_REDUCE(F5);
output[48] = Q_REDUCE(F6);
output[56] = Q_REDUCE(F7);
output[ 0] = Q_REDUCE( F[ 0] );
output[ 8] = Q_REDUCE( F[ 1] );
output[16] = Q_REDUCE( F[ 2] );
output[24] = Q_REDUCE( F[ 3] );
output[32] = Q_REDUCE( F[ 4] );
output[40] = Q_REDUCE( F[ 5] );
output[48] = Q_REDUCE( F[ 6] );
output[56] = Q_REDUCE( F[ 7] );
// Iteration 1:
ADD_SUB(F8, F9);
ADD_SUB(F10, F11);
ADD_SUB(F12, F13);
ADD_SUB(F14, F15);
ADD_SUB( F[ 8], F[ 9] );
ADD_SUB( F[10], F[11] );
ADD_SUB( F[12], F[13] );
ADD_SUB( F[14], F[15] );
F[11] <<= 4;
F[15] <<= 4;
ADD_SUB( F[ 8], F[10] );
ADD_SUB( F[ 9], F[11] );
ADD_SUB( F[12], F[14] );
ADD_SUB( F[13], F[15] );
F[13] <<= 2;
F[14] <<= 4;
F[15] <<= 6;
ADD_SUB( F[ 8], F[12] );
ADD_SUB( F[ 9], F[13] );
ADD_SUB( F[10], F[14] );
ADD_SUB( F[11], F[15] );
F11 <<= 4;
F15 <<= 4;
ADD_SUB(F8, F10);
ADD_SUB(F9, F11);
ADD_SUB(F12, F14);
ADD_SUB(F13, F15);
F13 <<= 2;
F14 <<= 4;
F15 <<= 6;
ADD_SUB(F8, F12);
ADD_SUB(F9, F13);
ADD_SUB(F10, F14);
ADD_SUB(F11, F15);
output[1] = Q_REDUCE(F8);
output[9] = Q_REDUCE(F9);
output[17] = Q_REDUCE(F10);
output[25] = Q_REDUCE(F11);
output[33] = Q_REDUCE(F12);
output[41] = Q_REDUCE(F13);
output[49] = Q_REDUCE(F14);
output[57] = Q_REDUCE(F15);
output[ 1] = Q_REDUCE( F[ 8] );
output[ 9] = Q_REDUCE( F[ 9] );
output[17] = Q_REDUCE( F[10] );
output[25] = Q_REDUCE( F[11] );
output[33] = Q_REDUCE( F[12] );
output[41] = Q_REDUCE( F[13] );
output[49] = Q_REDUCE( F[14] );
output[57] = Q_REDUCE( F[15] );
// Iteration 2:
ADD_SUB(F16, F17);
ADD_SUB(F18, F19);
ADD_SUB(F20, F21);
ADD_SUB(F22, F23);
ADD_SUB( F[16], F[17] );
ADD_SUB( F[18], F[19] );
ADD_SUB( F[20], F[21] );
ADD_SUB( F[22], F[23] );
F[19] <<= 4;
F[23] <<= 4;
ADD_SUB( F[16], F[18]);
ADD_SUB( F[17], F[19]);
ADD_SUB( F[20], F[22]);
ADD_SUB( F[21], F[23]);
F[21] <<= 2;
F[22] <<= 4;
F[23] <<= 6;
ADD_SUB( F[16], F[20] );
ADD_SUB( F[17], F[21] );
ADD_SUB( F[18], F[22] );
ADD_SUB( F[19], F[23] );
F19 <<= 4;
F23 <<= 4;
ADD_SUB(F16, F18);
ADD_SUB(F17, F19);
ADD_SUB(F20, F22);
ADD_SUB(F21, F23);
F21 <<= 2;
F22 <<= 4;
F23 <<= 6;
ADD_SUB(F16, F20);
ADD_SUB(F17, F21);
ADD_SUB(F18, F22);
ADD_SUB(F19, F23);
output[2] = Q_REDUCE(F16);
output[10] = Q_REDUCE(F17);
output[18] = Q_REDUCE(F18);
output[26] = Q_REDUCE(F19);
output[34] = Q_REDUCE(F20);
output[42] = Q_REDUCE(F21);
output[50] = Q_REDUCE(F22);
output[58] = Q_REDUCE(F23);
output[ 2] = Q_REDUCE( F[16] );
output[10] = Q_REDUCE( F[17] );
output[18] = Q_REDUCE( F[18] );
output[26] = Q_REDUCE( F[19] );
output[34] = Q_REDUCE( F[20] );
output[42] = Q_REDUCE( F[21] );
output[50] = Q_REDUCE( F[22] );
output[58] = Q_REDUCE( F[23] );
// Iteration 3:
ADD_SUB(F24, F25);
ADD_SUB(F26, F27);
ADD_SUB(F28, F29);
ADD_SUB(F30, F31);
ADD_SUB( F[24], F[25] );
ADD_SUB( F[26], F[27] );
ADD_SUB( F[28], F[29] );
ADD_SUB( F[30], F[31] );
F[27] <<= 4;
F[31] <<= 4;
ADD_SUB( F[24], F[26] );
ADD_SUB( F[25], F[27] );
ADD_SUB( F[28], F[30] );
ADD_SUB( F[29], F[31] );
F[29] <<= 2;
F[30] <<= 4;
F[31] <<= 6;
ADD_SUB( F[24], F[28] );
ADD_SUB( F[25], F[29] );
ADD_SUB( F[26], F[30] );
ADD_SUB( F[27], F[31] );
F27 <<= 4;
F31 <<= 4;
ADD_SUB(F24, F26);
ADD_SUB(F25, F27);
ADD_SUB(F28, F30);
ADD_SUB(F29, F31);
F29 <<= 2;
F30 <<= 4;
F31 <<= 6;
ADD_SUB(F24, F28);
ADD_SUB(F25, F29);
ADD_SUB(F26, F30);
ADD_SUB(F27, F31);
output[3] = Q_REDUCE(F24);
output[11] = Q_REDUCE(F25);
output[19] = Q_REDUCE(F26);
output[27] = Q_REDUCE(F27);
output[35] = Q_REDUCE(F28);
output[43] = Q_REDUCE(F29);
output[51] = Q_REDUCE(F30);
output[59] = Q_REDUCE(F31);
output[ 3] = Q_REDUCE( F[24] );
output[11] = Q_REDUCE( F[25] );
output[19] = Q_REDUCE( F[26] );
output[27] = Q_REDUCE( F[27] );
output[35] = Q_REDUCE( F[28] );
output[43] = Q_REDUCE( F[29] );
output[51] = Q_REDUCE( F[30] );
output[59] = Q_REDUCE( F[31] );
// Iteration 4:
ADD_SUB(F32, F33);
ADD_SUB(F34, F35);
ADD_SUB(F36, F37);
ADD_SUB(F38, F39);
ADD_SUB( F[32], F[33] );
ADD_SUB( F[34], F[35] );
ADD_SUB( F[36], F[37] );
ADD_SUB( F[38], F[39] );
F[35] <<= 4;
F[39] <<= 4;
ADD_SUB( F[32], F[34] );
ADD_SUB( F[33], F[35] );
ADD_SUB( F[36], F[38] );
ADD_SUB( F[37], F[39] );
F[37] <<= 2;
F[38] <<= 4;
F[39] <<= 6;
ADD_SUB( F[32], F[36] );
ADD_SUB( F[33], F[37] );
ADD_SUB( F[34], F[38] );
ADD_SUB( F[35], F[39] );
F35 <<= 4;
F39 <<= 4;
ADD_SUB(F32, F34);
ADD_SUB(F33, F35);
ADD_SUB(F36, F38);
ADD_SUB(F37, F39);
F37 <<= 2;
F38 <<= 4;
F39 <<= 6;
ADD_SUB(F32, F36);
ADD_SUB(F33, F37);
ADD_SUB(F34, F38);
ADD_SUB(F35, F39);
output[4] = Q_REDUCE(F32);
output[12] = Q_REDUCE(F33);
output[20] = Q_REDUCE(F34);
output[28] = Q_REDUCE(F35);
output[36] = Q_REDUCE(F36);
output[44] = Q_REDUCE(F37);
output[52] = Q_REDUCE(F38);
output[60] = Q_REDUCE(F39);
output[ 4] = Q_REDUCE( F[32] );
output[12] = Q_REDUCE( F[33] );
output[20] = Q_REDUCE( F[34] );
output[28] = Q_REDUCE( F[35] );
output[36] = Q_REDUCE( F[36] );
output[44] = Q_REDUCE( F[37] );
output[52] = Q_REDUCE( F[38] );
output[60] = Q_REDUCE( F[39] );
// Iteration 5:
ADD_SUB(F40, F41);
ADD_SUB(F42, F43);
ADD_SUB(F44, F45);
ADD_SUB(F46, F47);
ADD_SUB( F[40], F[41] );
ADD_SUB( F[42], F[43] );
ADD_SUB( F[44], F[45] );
ADD_SUB( F[46], F[47] );
F[43] <<= 4;
F[47] <<= 4;
ADD_SUB( F[40], F[42] );
ADD_SUB( F[41], F[43] );
ADD_SUB( F[44], F[46] );
ADD_SUB( F[45], F[47] );
F[45] <<= 2;
F[46] <<= 4;
F[47] <<= 6;
ADD_SUB( F[40], F[44] );
ADD_SUB( F[41], F[45] );
ADD_SUB( F[42], F[46] );
ADD_SUB( F[43], F[47] );
F43 <<= 4;
F47 <<= 4;
ADD_SUB(F40, F42);
ADD_SUB(F41, F43);
ADD_SUB(F44, F46);
ADD_SUB(F45, F47);
F45 <<= 2;
F46 <<= 4;
F47 <<= 6;
ADD_SUB(F40, F44);
ADD_SUB(F41, F45);
ADD_SUB(F42, F46);
ADD_SUB(F43, F47);
output[5] = Q_REDUCE(F40);
output[13] = Q_REDUCE(F41);
output[21] = Q_REDUCE(F42);
output[29] = Q_REDUCE(F43);
output[37] = Q_REDUCE(F44);
output[45] = Q_REDUCE(F45);
output[53] = Q_REDUCE(F46);
output[61] = Q_REDUCE(F47);
output[ 5] = Q_REDUCE( F[40] );
output[13] = Q_REDUCE( F[41] );
output[21] = Q_REDUCE( F[42] );
output[29] = Q_REDUCE( F[43] );
output[37] = Q_REDUCE( F[44] );
output[45] = Q_REDUCE( F[45] );
output[53] = Q_REDUCE( F[46] );
output[61] = Q_REDUCE( F[47] );
// Iteration 6:
ADD_SUB(F48, F49);
ADD_SUB(F50, F51);
ADD_SUB(F52, F53);
ADD_SUB(F54, F55);
ADD_SUB( F[48], F[49] );
ADD_SUB( F[50], F[51] );
ADD_SUB( F[52], F[53] );
ADD_SUB( F[54], F[55] );
F[51] <<= 4;
F[55] <<= 4;
ADD_SUB( F[48], F[50] );
ADD_SUB( F[49], F[51] );
ADD_SUB( F[52], F[54] );
ADD_SUB( F[53], F[55] );
F[53] <<= 2;
F[54] <<= 4;
F[55] <<= 6;
ADD_SUB( F[48], F[52] );
ADD_SUB( F[49], F[53] );
ADD_SUB( F[50], F[54] );
ADD_SUB( F[51], F[55] );
F51 <<= 4;
F55 <<= 4;
ADD_SUB(F48, F50);
ADD_SUB(F49, F51);
ADD_SUB(F52, F54);
ADD_SUB(F53, F55);
F53 <<= 2;
F54 <<= 4;
F55 <<= 6;
ADD_SUB(F48, F52);
ADD_SUB(F49, F53);
ADD_SUB(F50, F54);
ADD_SUB(F51, F55);
output[6] = Q_REDUCE(F48);
output[14] = Q_REDUCE(F49);
output[22] = Q_REDUCE(F50);
output[30] = Q_REDUCE(F51);
output[38] = Q_REDUCE(F52);
output[46] = Q_REDUCE(F53);
output[54] = Q_REDUCE(F54);
output[62] = Q_REDUCE(F55);
output[ 6] = Q_REDUCE( F[48] );
output[14] = Q_REDUCE( F[49] );
output[22] = Q_REDUCE( F[50] );
output[30] = Q_REDUCE( F[51] );
output[38] = Q_REDUCE( F[52] );
output[46] = Q_REDUCE( F[53] );
output[54] = Q_REDUCE( F[54] );
output[62] = Q_REDUCE( F[55] );
// Iteration 7:
ADD_SUB(F56, F57);
ADD_SUB(F58, F59);
ADD_SUB(F60, F61);
ADD_SUB(F62, F63);
ADD_SUB( F[56], F[57] );
ADD_SUB( F[58], F[59] );
ADD_SUB( F[60], F[61] );
ADD_SUB( F[62], F[63] );
F[59] <<= 4;
F[63] <<= 4;
ADD_SUB( F[56], F[58] );
ADD_SUB( F[57], F[59] );
ADD_SUB( F[60], F[62] );
ADD_SUB( F[61], F[63] );
F[61] <<= 2;
F[62] <<= 4;
F[63] <<= 6;
ADD_SUB( F[56], F[60] );
ADD_SUB( F[57], F[61] );
ADD_SUB( F[58], F[62] );
ADD_SUB( F[59], F[63] );
F59 <<= 4;
F63 <<= 4;
ADD_SUB(F56, F58);
ADD_SUB(F57, F59);
ADD_SUB(F60, F62);
ADD_SUB(F61, F63);
F61 <<= 2;
F62 <<= 4;
F63 <<= 6;
ADD_SUB(F56, F60);
ADD_SUB(F57, F61);
ADD_SUB(F58, F62);
ADD_SUB(F59, F63);
output[7] = Q_REDUCE(F56);
output[15] = Q_REDUCE(F57);
output[23] = Q_REDUCE(F58);
output[31] = Q_REDUCE(F59);
output[39] = Q_REDUCE(F60);
output[47] = Q_REDUCE(F61);
output[55] = Q_REDUCE(F62);
output[63] = Q_REDUCE(F63);
output[ 7] = Q_REDUCE( F[56] );
output[15] = Q_REDUCE( F[57] );
output[23] = Q_REDUCE( F[58] );
output[31] = Q_REDUCE( F[59] );
output[39] = Q_REDUCE( F[60] );
output[47] = Q_REDUCE( F[61] );
output[55] = Q_REDUCE( F[62] );
output[63] = Q_REDUCE( F[63] );
#undef ADD_SUB
#undef Q_REDUCE

View File

@@ -134,10 +134,10 @@ int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len )
int sha3_4way_final( void *md, sha3_4way_ctx_t *c )
{
c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ],
m256_const1_64( 6 ) );
_mm256_set1_epi64x( 6 ) );
c->st[ c->rsiz / 8 - 1 ] =
_mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ],
m256_const1_64( 0x8000000000000000 ) );
_mm256_set1_epi64x( 0x8000000000000000 ) );
sha3_4way_keccakf( c->st );
memcpy( md, c->st, c->mdlen * 4 );
return 1;
@@ -268,10 +268,10 @@ int sha3_8way_final( void *md, sha3_8way_ctx_t *c )
{
c->st[ c->pt ] =
_mm512_xor_si512( c->st[ c->pt ],
m512_const1_64( 6 ) );
_mm512_set1_epi64( 6 ) );
c->st[ c->rsiz / 8 - 1 ] =
_mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ],
m512_const1_64( 0x8000000000000000 ) );
_mm512_set1_epi64( 0x8000000000000000 ) );
sha3_8way_keccakf( c->st );
memcpy( md, c->st, c->mdlen * 8 );
return 1;

View File

@@ -201,7 +201,7 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m512i eight = m512_const1_64( 8 );
const __m512i eight = _mm512_set1_epi64( 8 );
const bool bench = opt_benchmark;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
@@ -369,7 +369,7 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );

View File

@@ -114,7 +114,7 @@ int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n +=8;
} while ( likely( ( n < last_nonce ) && !( *restart ) ) );
pdata[19] = n;
@@ -218,7 +218,7 @@ int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n +=4;
} while ( likely( ( n < last_nonce ) && !( *restart ) ) );
pdata[19] = n;

View File

@@ -536,7 +536,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
@@ -963,7 +963,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;

View File

@@ -49,7 +49,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
@@ -102,7 +102,7 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( ( n < last_nonce ) && !(*restart) );
pdata[19] = n;

View File

@@ -26,7 +26,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( opt_debug && !thr_id )
if ( !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, swab32( pdata[17] ), timeHash );
}

View File

@@ -658,7 +658,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
@@ -1143,7 +1143,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;

View File

@@ -181,7 +181,7 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
@@ -335,7 +335,7 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;

View File

@@ -254,7 +254,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m512i eight = m512_const1_64( 8 );
const __m512i eight = _mm512_set1_epi64( 8 );
const bool bench = opt_benchmark;
// convert LE32 to LE64
@@ -468,7 +468,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
// convert LE32 to LE64

View File

@@ -445,7 +445,7 @@ int scanhash_x22i_8way_sha( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -494,7 +494,7 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -787,7 +787,7 @@ int scanhash_x22i_4way_sha( struct work* work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -835,7 +835,7 @@ int scanhash_x22i_4way( struct work* work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -571,7 +571,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const bool bench = opt_benchmark;
const __m512i eight = m512_const1_64( 8 );
const __m512i eight = _mm512_set1_epi64( 8 );
if ( bench ) ptarget[7] = 0x08ff;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
@@ -927,7 +927,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x08ff;

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.22.3.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.0.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.22.3'
PACKAGE_STRING='cpuminer-opt 3.22.3'
PACKAGE_VERSION='3.23.0'
PACKAGE_STRING='cpuminer-opt 3.23.0'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.22.3 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.23.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.22.3:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.23.0:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.22.3
cpuminer-opt configure 3.23.0
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.22.3, which was
It was created by cpuminer-opt $as_me 3.23.0, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.22.3'
VERSION='3.23.0'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.22.3, which was
This file was extended by cpuminer-opt $as_me 3.23.0, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 3.22.3
cpuminer-opt config.status 3.23.0
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.22.3])
AC_INIT([cpuminer-opt], [3.23.0])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

7647
configure~ Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -1532,6 +1532,7 @@ const char *getwork_req =
#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"
#define GBT_RULES "[\"segwit\"]"
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"rules\": " GBT_RULES "}], \"id\":0}\r\n";
@@ -1589,18 +1590,21 @@ start:
json_decref( val );
goto start;
}
allow_getwork = false; // GBT is working, disable fallback
}
else
rc = work_decode( json_object_get( val, "result" ), work );
if ( rc )
{
bool new_work = true;
json_decref( val );
get_mininginfo( curl, work );
report_summary_log( false );
if ( opt_protocol | opt_debug )
if ( opt_protocol || opt_debug )
{
timeval_subtract( &diff, &tv_end, &tv_start );
applog( LOG_INFO, "%s new work received in %.2f ms",
@@ -1621,8 +1625,10 @@ start:
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
if ( !opt_quiet )
else
new_work = false;
if ( new_work && !opt_quiet )
{
double miner_hr = 0.;
double net_hr = net_hashrate;
@@ -2745,10 +2751,14 @@ static void *stratum_thread(void *userdata )
}
else
{
stratum_down = false;
// sometimes stratum connects but doesn't immediately send a job, wait for one.
// stratum_down = false;
applog(LOG_BLUE,"Stratum connection established" );
if ( stratum.new_job ) // prime first job
{
stratum_down = false;
stratum_gen_work( &stratum, &g_work );
}
}
}
@@ -2757,6 +2767,7 @@ static void *stratum_thread(void *userdata )
{
if ( likely( s = stratum_recv_line( &stratum ) ) )
{
stratum_down = false;
if ( likely( !stratum_handle_method( &stratum, s ) ) )
stratum_handle_response( s );
free( s );
@@ -2848,6 +2859,7 @@ static bool cpu_capability( bool display_only )
bool cpu_has_sha = has_sha();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_vaes = has_vaes();
bool cpu_has_avx10 = has_avx10();
bool sw_has_aes = false;
bool sw_has_sse2 = false;
bool sw_has_sse42 = false;
@@ -2912,8 +2924,8 @@ static bool cpu_capability( bool display_only )
#ifdef _MSC_VER
" with VC++ 2013\n");
#elif defined(__GNUC__)
" with GCC");
printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
" with GCC-");
printf("%d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
#else
printf("\n");
#endif
@@ -2927,6 +2939,8 @@ static bool cpu_capability( bool display_only )
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha ) printf( " SHA" );
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
avx10_version(), avx10_vector_length() );
printf("\nSW features: ");
if ( sw_has_avx512 ) printf( " AVX512" );

View File

@@ -15,10 +15,6 @@
// data but not for vectors. The main categories are bit rotation
// and endian byte swapping
//
// An attempt was made to make the names as similar as possible to
// Intel's intrinsic function format. Most variations are to avoid
// confusion with actual Intel intrinsics, brevity, and clarity.
//
// This suite supports some operations on regular 64 bit integers
// as well as 128 bit integers available on recent versions of Linux
// and GCC.
@@ -37,6 +33,9 @@
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2.
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (Starting with SkylakeX)
// AVX10: when available will supersede AVX512 and will bring AVX512
// features, except 512 bit vectors, to Intel's Ecores. It needs to be
// enabled manually when the relevant GCC macros are known.
//
// Most functions are avalaible at the stated levels but in rare cases
// a higher level feature may be required with no compatible alternative.
@@ -53,21 +52,17 @@
// for the applications but also adds responsibility to ensure adequate data
// alignment.
//
// Windows has problems with function vector arguments larger than
// 128 bits. Stack alignment is only guaranteed to 16 bytes. Always use
// pointers for larger vectors in function arguments. Macros can be used
// for larger value arguments.
//
// An attempt was made to make the names as similar as possible to
// Intel's intrinsic function format. Most variations are to avoid
// confusion with actual Intel intrinsics, brevity, and clarity
// confusion with actual Intel intrinsics, brevity, and clarity.
//
// The main differences are:
//
// - the leading underscore(s) "_" and the "i" are dropped from the
// prefix of vector instructions.
// - "mm64" and "mm128" used for 64 and 128 bit prefix respectively
// to avoid the ambiguity of "mm".
// - the leading underscore "_" is dropped from the prefix of vector function
// macros.
// - "mm128" is used 128 bit prefix to be consistent with mm256 & mm512 and
// to avoid the ambiguity of "mm" which is also used for 64 bit MMX
// intrinsics.
// - the element size does not include additional type specifiers
// like "epi".
// - there is a subset of some functions for scalar data. They may have
@@ -76,14 +71,14 @@
//
// Function names follow this pattern:
//
// prefix_op[vsize]_[esize]
// [prefix]_[op][vsize]_[esize]
//
// Prefix: usually the size of the returned vector.
// Following are some examples:
//
// u64: unsigned 64 bit integer function
// i128: signed 128 bit integer function (rarely used)
// m128: 128 bit vector identifier
// m128: 128 bit vector identifier (deprecated)
// mm128: 128 bit vector function
//
// op: describes the operation of the function or names the data
@@ -94,7 +89,7 @@
// vsize: optional, lane size used when a function operates on elements
// within lanes of a larger vector.
//
// mm256_shuflr128_32 rotates each 128 bit lane of a 256 bit vector
// Ex: mm256_shuflr128_32 rotates each 128 bit lane of a 256 bit vector
// right by 32 bits.
//
// Vector constants

View File

@@ -731,6 +731,67 @@ static inline void extr_lane_8x32( void *d, const void *s,
#if defined(__AVX2__)
#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
//TODO Enable for AVX10_256 AVX10_512
// Combine byte swap & broadcast in one permute
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi32( 0x00010203 );
const __m256i c1 = _mm256_set1_epi32( 0x04050607 );
const __m256i c2 = _mm256_set1_epi32( 0x08090a0b );
const __m256i c3 = _mm256_set1_epi32( 0x0c0d0e0f );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 1 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 2 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 3 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d, 4 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 5 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 6 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 7 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d, 8 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d, 9 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,10 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,11 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,12 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,13 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,14 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,15 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,16 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,17 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,18 ) = _mm256_permutexvar_epi8( c2,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,19 ) = _mm256_permutexvar_epi8( c3,
_mm256_castsi128_si256( s4 ) );
}
#else
static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
@@ -792,6 +853,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
_mm256_castsi128_si256( s4 ), c3 );
}
#endif // AVX512VBMI else
#endif // AVX2
// 16x32
@@ -1173,10 +1235,12 @@ static inline void extr_lane_16x32( void *d, const void *s,
((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ];
}
#if defined(__AVX512F__) && defined(__AVX512VL__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__AVX512VBMI__)
// TODO Enable for AVX10_512
// Combine byte swap & broadcast in one permute
static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
{
@@ -1496,10 +1560,48 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}
#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
//TODO Enable for AVX10_256 AVX10_512
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i c0 = _mm256_set1_epi64x( 0x0405060700010203 );
const __m256i c1 = _mm256_set1_epi64x( 0x0c0d0e0f08090a0b );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d,1 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s0 ) );
casti_m256i( d,2 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d,3 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s1 ) );
casti_m256i( d,4 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,5 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s2 ) );
casti_m256i( d,6 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,7 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s3 ) );
casti_m256i( d,8 ) = _mm256_permutexvar_epi8( c0,
_mm256_castsi128_si256( s4 ) );
casti_m256i( d,9 ) = _mm256_permutexvar_epi8( c1,
_mm256_castsi128_si256( s4 ) );
}
#else
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
const __m256i bswap_shuf = mm256_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
__m256i s0 = casti_m256i( src,0 );
__m256i s1 = casti_m256i( src,1 );
__m128i s4 = casti_m128i( src,4 );
@@ -1524,6 +1626,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}
#endif
#endif // AVX2
// 8x64 (AVX512)
@@ -1846,6 +1950,8 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
#if defined(__AVX512F__) && defined(__AVX512VL__)
//TODO Enable for AVX10_512
// broadcast to all lanes
static inline void mm512_intrlv80_8x64( void *dst, const void *src )
{
@@ -2089,10 +2195,36 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
d0[3] = s[12]; d1[3] = s[13]; d2[3] = s[14]; d3[3] = s[15];
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
#if defined(__AVX512VBMI__)
//TODO Enable for AVX10_512
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m512i bswap_shuf = mm512_bcast_m128(
_mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
const __m128i s0 = casti_m128i( src,0 );
const __m128i s1 = casti_m128i( src,1 );
const __m128i s2 = casti_m128i( src,2 );
const __m128i s3 = casti_m128i( src,3 );
const __m128i s4 = casti_m128i( src,4 );
casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ),
bswap_shuf );
casti_m512i( d,1 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s1 ),
bswap_shuf );
casti_m512i( d,2 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s2 ),
bswap_shuf );
casti_m512i( d,3 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s3 ),
bswap_shuf );
casti_m512i( d,4 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s4 ),
bswap_shuf );
}
#else
static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
@@ -2108,14 +2240,15 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
s3 = _mm_shuffle_epi8( s3, bswap_shuf );
s4 = _mm_shuffle_epi8( s4, bswap_shuf );
casti_m512i( d, 0 ) = mm512_bcast_m128( s0 );
casti_m512i( d, 1 ) = mm512_bcast_m128( s1 );
casti_m512i( d, 2 ) = mm512_bcast_m128( s2 );
casti_m512i( d, 3 ) = mm512_bcast_m128( s3 );
casti_m512i( d, 4 ) = mm512_bcast_m128( s4 );
}
casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
}
#endif
#endif // AVX512VBMI ELSE
#endif // AVX512
// 2x256 (AVX512)
@@ -2955,6 +3088,8 @@ do { \
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
//TODO Enable for AVX10_512
/*
#define mm512_intrlv_blend_128( hi, lo ) \
_mm512_mask_blend_epi32( 0x0f0f, hi, lo )

View File

@@ -43,9 +43,11 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;
// Deprecated. EVEX adds support for integer argument in broadcast instruction
// eliminating the need for an explicit move in most cases. Use the set1
// intrinsic with integers and let the compiler figure it out.
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move betweeen GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploilt new features to produce optimum code.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
@@ -73,15 +75,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
//#define mm128_bcast_m64( v ) _mm_shuffle_epi32( v, 0x44 )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )
// Deprecated, use set1 directly
#define m128_const1_64 _mm_set1_epi64x
#define m128_const1_32 _mm_set1_epi32
// Deprecated, use set directly
#define m128_const_64 _mm_set_epi64x
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
//#define m128_one_64 _mm_set1_epi64x( 1 )
@@ -141,7 +135,7 @@ static inline __m128i mm128_neg1_fn()
// Examples of simple operations using xim:
// Insert 32 bit integer into v at element c and return updated v.
// Copy i to element c of dest and copy remaining elemnts from v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -161,6 +155,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Bitwise not (~v)
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m128i mm128_not( const __m128i v )
{ return _mm_ternarylogic_epi64( v, v, v, 1 ); }
@@ -223,18 +218,54 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 )
// a & b & c
#define mm128_and3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x80 )
// a | b | c
#define mm128_or3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xfe )
// a ^ ( b & c )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 )
// a & ( b ^ c )
#define mm128_andxor( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x60 )
// a ^ ( b | c )
#define mm128_xoror( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x1e )
// a ^ ( ~b & c )
#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
// a | ( b & c )
#define mm128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define mm128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define mm128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define mm128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
#define mm128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define mm128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#endif
@@ -257,6 +288,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// transparency.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
@@ -372,7 +404,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__) && !defined(__AVX512VL__)
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define m1286_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
@@ -380,7 +415,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#if defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
@@ -390,7 +427,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
// Rotate 32 bit lanes
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#if defined(__AVX512VL__)
#define mm128_swap32_16( v ) _mm_ror_epi32( v, 16 )
#elif defined(__SSSE3__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
@@ -400,7 +439,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#if defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
#elif defined(__SSSE3__)
#define mm128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )

View File

@@ -13,17 +13,14 @@
// automatically but their use is limited because 256 bit vectors are less
// likely to be used when 512 is available.
//
// AVX10_256 will support AVX512VL instructions on CPUs limited to 256 bit
// vectors. This will require enabling when the compiler's AVX10 feature
// macros are known.
//
// "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
// lanes and data can't cross the 128 bit lane boundary.
// Full width byte shuffle is available with AVX512VL using the mask version
// with a full mask (-1).
// Instructions that can move data across 128 bit lane boundary incur a
// performance penalty over those that can't.
// Some usage of index vectors may be encoded as if full vector shuffles are
// supported. This has no side effects and would have the same results using
// either version.
// If the need arises and AVX512VL is available, 256 bit full vector byte
// shuffles can be implemented using the AVX512 mask feature with a NULL mask.
#if defined(__AVX__)
@@ -66,6 +63,7 @@ typedef union
// Set either the low or high 64 bit elements in 128 bit lanes, other elements
// are set to zero.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm256_bcast128lo_64( i64 ) _mm256_maskz_set1_epi64( 0x55, i64 )
#define mm256_bcast128hi_64( i64 ) _mm256_maskz_set1_epi64( 0xaa, i64 )
@@ -81,11 +79,9 @@ typedef union
#define mm256_set2_64( i1, i0 ) mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )
// Deprecated
#define m256_const1_64 _mm256_set1_epi64x
#define m256_const1_32 _mm256_set1_epi32
#define mm256_set4_32( i3, i2, i1, i0 ) \
mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )
//
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
@@ -121,6 +117,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Basic operations without SIMD equivalent
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
static inline __m256i mm256_not( const __m256i v )
{ return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
@@ -140,8 +137,7 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )
#if defined(__AVX512VL__)
// AVX512 has ternary logic that supports any 3 input boolean expression.
//TODO Enable for AVX10_256
// a ^ b ^ c
#define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 )
@@ -176,31 +172,31 @@ static inline __m256i mm256_not( const __m256i v )
#else
#define mm256_xor3( a, b, c ) \
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
#define mm256_xor4( a, b, c, d ) \
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
#define mm256_and3( a, b, c ) \
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
#define mm256_or3( a, b, c ) \
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorand( a, b, c ) \
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
#define mm256_andxor( a, b, c ) \
_mm256_and_si256( a, _mm256_xor_si256( b, c ))
#define mm256_xoror( a, b, c ) \
_mm256_xor_si256( a, _mm256_or_si256( b, c ) )
_mm256_xor_si256( a, _mm256_or_si256( b, c ) )
#define mm256_xorandnot( a, b, c ) \
_mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )
_mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
@@ -226,6 +222,7 @@ static inline __m256i mm256_not( const __m256i v )
// transparency.
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
@@ -380,6 +377,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else

View File

@@ -113,10 +113,6 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_set2_64( i1, i0 ) \
mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )
// Deprecated, use set
#define m512_const1_64 _mm512_set1_epi64
#define m512_const1_32 _mm512_set1_epi32
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
// Deprecated

View File

@@ -174,35 +174,147 @@ static inline int cpu_fanpercent()
return 0;
}
// CPUID
// This list is incomplete, it only contains features of interest to cpuminer.
// refer to http://en.wikipedia.org/wiki/CPUID for details.
// AVX10 compatibility notes
//
// Notation used: AVX10i.[version]_[vectorwidth]
// AVX10.1_512 is a rebranding of AVX512 and is effectively the AVX* superset
// with full 512 bit vector support.
// AVX10.2_256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
// features applied only to 256 bit and 128 bit vectors.
// Future AVX10 versions will add new instructions and features.
// Register array indexes
#define EAX_Reg (0)
#define EBX_Reg (1)
#define ECX_Reg (2)
#define EDX_Reg (3)
// CPUID function number, aka leaf (EAX)
#define VENDOR_ID (0)
#define CPU_INFO (1)
#define CACHE_TLB_DESCRIPTOR (2)
#define EXTENDED_FEATURES (7)
#define AVX10_FEATURES (0x24)
#define HIGHEST_EXT_FUNCTION (0x80000000)
#define EXTENDED_CPU_INFO (0x80000001)
#define CPU_BRAND_1 (0x80000002)
#define CPU_BRAND_2 (0x80000003)
#define CPU_BRAND_3 (0x80000004)
// CPU_INFO: EAX=1, ECX=0
// ECX
#define SSE3_Flag 1
#define SSSE3_Flag (1<< 9)
#define XOP_Flag (1<<11) // obsolete
#define FMA3_Flag (1<<12)
#define SSE41_Flag (1<<19)
#define SSE42_Flag (1<<20)
#define AES_NI_Flag (1<<25)
#define XSAVE_Flag (1<<26)
#define OSXSAVE_Flag (1<<27)
#define AVX_Flag (1<<28)
// EDX
#define MMX_Flag (1<<23)
#define SSE_Flag (1<<25)
#define SSE2_Flag (1<<26)
// EXTENDED_FEATURES subleaf 0: EAX=7, ECX=0
// EBX
#define AVX2_Flag (1<< 5)
#define AVX512_F_Flag (1<<16)
#define AVX512_DQ_Flag (1<<17)
#define AVX512_IFMA_Flag (1<<21)
#define AVX512_PF_Flag (1<<26)
#define AVX512_ER_Flag (1<<27)
#define AVX512_CD_Flag (1<<28)
#define SHA_Flag (1<<29)
#define AVX512_BW_Flag (1<<30)
#define AVX512_VL_Flag (1<<31)
// ECX
#define AVX512_VBMI_Flag (1<< 1)
#define AVX512_VBMI2_Flag (1<< 6)
#define VAES_Flag (1<< 9)
#define AVX512_VNNI_Flag (1<<11)
#define AVX512_BITALG_Flag (1<<12)
#define AVX512_VPOPCNTDQ_Flag (1<<14)
// EDX
#define AVX512_4VNNIW_Flag (1<< 2)
#define AVX512_4FMAPS_Flag (1<< 3)
#define AVX512_VP2INTERSECT_Flag (1<< 8)
#define AMX_BF16_Flag (1<<22)
#define AVX512_FP16_Flag (1<<23)
#define AMX_TILE_Flag (1<<24)
#define AMX_INT8_Flag (1<<25)
// EXTENDED_FEATURES subleaf 1: EAX=7, ECX=1
// EAX
#define SHA512_Flag 1
#define SM3_Flag (1<< 1)
#define SM4_Flag (1<< 2)
#define AVX_VNNI_Flag (1<< 4)
#define AVX512_BF16_Flag (1<< 5)
#define AMX_FP16_Flag (1<<21)
#define AVX_IFMA_Flag (1<<23)
// EDX
#define AVX_VNNI_INT8_Flag (1<< 4)
#define AVX_NE_CONVERT_Flag (1<< 5)
#define AMX_COMPLEX_Flag (1<< 8)
#define AVX_VNNI_INT16_Flag (1<<10)
#define AVX10_Flag (1<<19)
#define APX_F_Flag (1<<21)
// AVX10_FEATURES: EAX=0x24, ECX=0
// EBX
#define AVX10_VERSION_mask 0xff // bits [7:0]
#define AVX10_128_Flag (1<<16)
#define AVX10_256_Flag (1<<17)
#define AVX10_512_Flag (1<<18)
// Use this to detect presence of feature
#define AVX_mask (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)
#ifndef __arm__
static inline void cpuid(int functionnumber, int output[4]) {
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
unsigned int output[4] )
{
#if defined (_MSC_VER) || defined (__INTEL_COMPILER)
// Microsoft or Intel compiler, intrin.h included
__cpuidex(output, functionnumber, 0);
// Microsoft or Intel compiler, intrin.h included
__cpuidex(output, leaf, subleaf );
#elif defined(__GNUC__) || defined(__clang__)
// use inline assembly, Gnu/AT&T syntax
int a, b, c, d;
asm volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(0));
output[0] = a;
output[1] = b;
output[2] = c;
output[3] = d;
// use inline assembly, Gnu/AT&T syntax
unsigned int a, b, c, d;
asm volatile( "cpuid"
: "=a"(a), "=b"(b), "=c"(c), "=d"(d)
: "a"(leaf), "c"(subleaf) );
output[ EAX_Reg ] = a;
output[ EBX_Reg ] = b;
output[ ECX_Reg ] = c;
output[ EDX_Reg ] = d;
#else
// unknown platform. try inline assembly with masm/intel syntax
__asm {
mov eax, functionnumber
xor ecx, ecx
cpuid;
mov esi, output
mov[esi], eax
mov[esi + 4], ebx
mov[esi + 8], ecx
mov[esi + 12], edx
}
// unknown platform. try inline assembly with masm/intel syntax
__asm {
mov eax, leaf
mov ecx, subleaf
cpuid;
mov esi, output
mov[esi], eax
mov[esi + 4], ebx
mov[esi + 8], ecx
mov[esi + 12], edx
}
#endif
}
#else /* !__arm__ */
#define cpuid(fn, out) out[0] = 0;
#define cpuid(leaf, subleaf, out) out[0] = 0;
#endif
static inline void cpu_getname(char *outbuf, size_t maxsz)
@@ -211,13 +323,13 @@ static inline void cpu_getname(char *outbuf, size_t maxsz)
#ifdef WIN32
char brand[256] = { 0 };
int output[4] = { 0 }, ext;
cpuid(0x80000000, output);
cpuid( 0x80000000, 0, output );
ext = output[0];
if (ext >= 0x80000004)
{
for (int i = 2; i <= (ext & 0xF); i++)
{
cpuid(0x80000000+i, output);
cpuid( 0x80000000+i, 0, output);
memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int));
}
snprintf(outbuf, maxsz, "%s", brand);
@@ -309,70 +421,97 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#endif
}
// http://en.wikipedia.org/wiki/CPUID
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
// Flags:
// AVX10 128 256 512
// 0 0 0 0 = AVX10 not supported
// 1 1 1 0 = AVX10 256 bit max (version 2)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
// CPUID commands
#define VENDOR_ID (0)
#define CPU_INFO (1)
#define CACHE_TLB_DESCRIPTOR (2)
#define EXTENDED_FEATURES (7)
#define HIGHEST_EXT_FUNCTION (0x80000000)
#define EXTENDED_CPU_INFO (0x80000001)
#define CPU_BRAND_1 (0x80000002)
#define CPU_BRAND_2 (0x80000003)
#define CPU_BRAND_3 (0x80000004)
// Test AVX10_flag before AVX10_FEATURES flags.
static inline bool has_avx10()
{
#ifdef __arm__
return false;
#else
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
return cpu_info[ EDX_Reg ] & AVX10_Flag;
#endif
}
// Registers
#define EAX_Reg (0)
#define EBX_Reg (1)
#define ECX_Reg (2)
#define EDX_Reg (3)
static inline unsigned int avx10_version()
{
#ifdef __arm__
return 0;
#else
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask;
}
return 0;
#endif
}
// Feature flags
static inline bool has_avx10_512()
{
#ifdef __arm__
return false;
#else
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
}
return false;
#endif
}
// CPU_INFO ECX
#define SSE3_Flag 1
#define SSSE3_Flag (1<< 9)
#define XOP_Flag (1<<11) // obsolete, only available on pre-Ryzen AMD
#define FMA3_Flag (1<<12)
#define AES_Flag (1<<25)
#define SSE41_Flag (1<<19)
#define SSE42_Flag (1<<20)
#define AES_Flag (1<<25)
#define XSAVE_Flag (1<<26)
#define OSXSAVE_Flag (1<<27)
#define AVX_Flag (1<<28)
static inline bool has_avx10_256()
{
#ifdef __arm__
return false;
#else
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
}
return false;
#endif
}
// CPU_INFO EDX
#define SSE_Flag (1<<25)
#define SSE2_Flag (1<<26)
// EXTENDED_FEATURES EBX
#define AVX2_Flag (1<< 5)
#define AVX512F_Flag (1<<16)
#define AVX512DQ_Flag (1<<17)
#define SHA_Flag (1<<29)
#define AVX512BW_Flag (1<<30)
#define AVX512VL_Flag (1<<31)
// EXTENDED_FEATURES ECX
#define AVX512VBMI_Flag (1<<1)
#define AVX512VBMI2_Flag (1<<6)
#define VAES_Flag (1<<9)
// Use this to detect presence of feature
#define AVX_mask (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
#define FMA3_mask (FMA3_Flag|AVX_mask)
#define AVX512_mask (AVX512VL_Flag|AVX512BW_Flag|AVX512DQ_Flag|AVX512F_Flag)
// Maximum vector length
static inline unsigned int avx10_vector_length()
{
#ifdef __arm__
return 0;
#else
if ( has_avx10() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( AVX10_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
: ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
}
return 0;
#endif
}
static inline bool has_sha()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
#endif
}
@@ -382,8 +521,8 @@ static inline bool has_sse2()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE2_Flag;
#endif
}
@@ -394,9 +533,9 @@ static inline bool has_aes_ni()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
return cpu_info[ ECX_Reg ] & AES_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AES_NI_Flag;
#endif
}
@@ -406,8 +545,8 @@ static inline bool has_avx()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask );
#endif
}
@@ -418,8 +557,8 @@ static inline bool has_avx2()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX2_Flag;
#endif
}
@@ -429,9 +568,9 @@ static inline bool has_avx512f()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512F_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_F_Flag;
#endif
}
@@ -440,9 +579,9 @@ static inline bool has_avx512dq()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512DQ_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag;
#endif
}
@@ -451,9 +590,9 @@ static inline bool has_avx512bw()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512BW_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_BW_Flag;
#endif
}
@@ -462,9 +601,9 @@ static inline bool has_avx512vl()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512VL_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512_VL_Flag;
#endif
}
@@ -474,30 +613,19 @@ static inline bool has_avx512()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask );
#endif
}
// AMD Zen3 added support for 256 bit VAES without requiring AVX512.
// The original Intel spec requires AVX512F to support 512 bit VAES and
// requires AVX512VL to support 256 bit VAES.
// The CPUID VAES bit alone can't distiguish 256 vs 512 bit.
// If necessary:
// VAES 256 & 512 = VAES && AVX512VL
// VAES 512 = VAES && AVX512F
// VAES 256 = ( VAES && AVX512VL ) || ( VAES && !AVX512F )
// VAES 512 only = VAES && AVX512F && !AVX512VL
// VAES 256 only = VAES && !AVX512F
static inline bool has_vaes()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & VAES_Flag;
#endif
}
@@ -507,9 +635,9 @@ static inline bool has_vbmi()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VBMI_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
#endif
}
@@ -518,9 +646,9 @@ static inline bool has_vbmi2()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512VBMI2_Flag;
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
#endif
}
@@ -530,8 +658,8 @@ static inline bool has_xop()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_CPU_INFO, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & XOP_Flag;
#endif
}
@@ -541,8 +669,8 @@ static inline bool has_fma3()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask );
#endif
}
@@ -552,8 +680,8 @@ static inline bool has_sse42()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ ECX_Reg ] & SSE42_Flag;
#endif
}
@@ -563,16 +691,16 @@ static inline bool has_sse()
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
unsigned int cpu_info[4] = { 0 };
cpuid( CPU_INFO, 0, cpu_info );
return cpu_info[ EDX_Reg ] & SSE_Flag;
#endif
}
static inline uint32_t cpuid_get_highest_function_number()
{
uint32_t cpu_info[4] = {0};
cpuid( VENDOR_ID, cpu_info);
unsigned int cpu_info[4] = {0};
cpuid( VENDOR_ID, 0, cpu_info);
return cpu_info[ EAX_Reg ];
}
@@ -605,8 +733,8 @@ static inline void cpu_bestfeature(char *outbuf, size_t maxsz)
#else
int cpu_info[4] = { 0 };
int cpu_info_adv[4] = { 0 };
cpuid( CPU_INFO, cpu_info );
cpuid( EXTENDED_FEATURES, cpu_info_adv );
cpuid( CPU_INFO, 0, cpu_info );
cpuid( EXTENDED_FEATURES, 0, cpu_info_adv );
if ( has_avx() && has_avx2() )
sprintf(outbuf, "AVX2");
@@ -634,14 +762,14 @@ static inline void cpu_brand_string( char* s )
sprintf( s, "ARM" );
#else
int cpu_info[4] = { 0 };
cpuid( VENDOR_ID, cpu_info );
cpuid( VENDOR_ID, 0, cpu_info );
if ( cpu_info[ EAX_Reg ] >= 4 )
{
cpuid( CPU_BRAND_1, cpu_info );
cpuid( CPU_BRAND_1, 0, cpu_info );
memcpy( s, cpu_info, sizeof(cpu_info) );
cpuid( CPU_BRAND_2, cpu_info );
cpuid( CPU_BRAND_2, 0, cpu_info );
memcpy( s + 16, cpu_info, sizeof(cpu_info) );
cpuid( CPU_BRAND_3, cpu_info );
cpuid( CPU_BRAND_3, 0, cpu_info );
memcpy( s + 32, cpu_info, sizeof(cpu_info) );
}
#endif