Jay D Dee
2023-08-30 20:15:48 -04:00
parent 57a6b7b58b
commit 4378d2f841
72 changed files with 10184 additions and 2182 deletions

View File

@@ -308,7 +308,52 @@ static const sph_u32 CS[16] = {
/////////////////////////////////////////
//
// Blake-256 1 way SIMD
// Only used for prehash; otherwise 4way with SSE2 is used.
// Optimized shuffles reduce latency caused by dependencies on V1.
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shufll_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shuflr_32( V2 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ), \
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ), \
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shuflr_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shufll_32( V2 ); \
}
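// A minimal sketch, assuming SSSE3 is available: mm128_swap32_16 and
// mm128_shuflr32_8 (helpers from the project's simd-utils headers) are
// plausibly single byte shuffles rather than shift/shift/or rotates:
/*
static inline __m128i mm128_swap32_16_sketch( __m128i x )   // rol 32-bit lanes by 16
{
   return _mm_shuffle_epi8( x, _mm_set_epi8( 13,12,15,14,  9, 8,11,10,
                                              5, 4, 7, 6,  1, 0, 3, 2 ) );
}
static inline __m128i mm128_shuflr32_8_sketch( __m128i x )  // ror 32-bit lanes by 8
{
   return _mm_shuffle_epi8( x, _mm_set_epi8( 12,15,14,13,  8,11,10, 9,
                                              4, 7, 6, 5,  0, 3, 2, 1 ) );
}
*/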
/*
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
@@ -350,6 +395,7 @@ static const sph_u32 CS[16] = {
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shufll_32( V1 ); \
}
*/
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 )

View File

@@ -252,14 +252,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
v[12] = m512_const1_64( 0x510E527FADE682D1 );
v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
v[ 8] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
v[ 9] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
v[10] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
v[11] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
v[12] = _mm512_set1_epi64( 0x510E527FADE682D1 );
v[13] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
v[14] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
v[15] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
{
size_t i;
ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
ctx->h[1] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
ctx->h[2] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
ctx->h[3] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = _mm512_set1_epi64( 0x510E527FADE682D1 );
ctx->h[5] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
ctx->h[6] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
ctx->h[0] = _mm512_xor_si512( ctx->h[0], _mm512_set1_epi64( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;
@@ -419,14 +419,14 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
v[ 5] = ctx->h[5];
v[ 6] = ctx->h[6];
v[ 7] = ctx->h[7];
v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
v[12] = m256_const1_64( 0x510E527FADE682D1 );
v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
v[ 8] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
v[ 9] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
v[10] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
v[11] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
v[12] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
v[13] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
v[14] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
v[15] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
{
size_t i;
ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
ctx->h[1] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
ctx->h[2] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
ctx->h[3] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
ctx->h[4] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
ctx->h[5] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
ctx->h[6] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
ctx->h[7] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
ctx->h[0] = _mm256_xor_si256( ctx->h[0], _mm256_set1_epi64x( 0x01010020 ) );
ctx->t[0] = 0;
ctx->t[1] = 0;

View File

@@ -62,14 +62,14 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
memset( S, 0, sizeof( blake2s_4way_state ) );
S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
S->h[0] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
S->h[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm_set1_epi64x( 0x510E527F510E527FULL );
S->h[5] = _mm_set1_epi64x( 0x9B05688C9B05688CULL );
S->h[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v[ 8] = m128_const1_64( 0x6A09E6676A09E667ULL );
v[ 9] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
v[10] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
v[11] = m128_const1_64( 0xA54FF53AA54FF53AULL );
v[ 8] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
v[ 9] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
v[10] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
v[11] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
m128_const1_64( 0x510E527F510E527FULL ) );
_mm_set1_epi64x( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
m128_const1_64( 0x9B05688C9B05688CULL ) );
_mm_set1_epi64x( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
m128_const1_64( 0x1F83D9AB1F83D9ABULL ) );
_mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
m128_const1_64( 0x5BE0CD195BE0CD19ULL ) );
_mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
@@ -269,21 +269,21 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
memcpy_256( m, block, 16 );
memcpy_256( v, S->h, 8 );
v[ 8] = m256_const1_64( 0x6A09E6676A09E667ULL );
v[ 9] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
v[10] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
v[11] = m256_const1_64( 0xA54FF53AA54FF53AULL );
v[ 8] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
v[ 9] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
v[10] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
v[11] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
m256_const1_64( 0x510E527F510E527FULL ) );
_mm256_set1_epi64x( 0x510E527F510E527FULL ) );
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
m256_const1_64( 0x9B05688C9B05688CULL ) );
_mm256_set1_epi64x( 0x9B05688C9B05688CULL ) );
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
m256_const1_64( 0x1F83D9AB1F83D9ABULL ) );
_mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
m256_const1_64( 0x5BE0CD195BE0CD19ULL ) );
_mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );
/*
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
@@ -391,14 +391,14 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_8way_state ) );
S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
S->h[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
S->h[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm256_set1_epi64x( 0x510E527F510E527FULL );
S->h[5] = _mm256_set1_epi64x( 0x9B05688C9B05688CULL );
S->h[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
// for( int i = 0; i < 8; ++i )
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
memcpy_512( m, block, 16 );
memcpy_512( v, S->h, 8 );
v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
v[ 8] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
v[ 9] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
v[10] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
v[11] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
m512_const1_64( 0x510E527F510E527FULL ) );
_mm512_set1_epi64( 0x510E527F510E527FULL ) );
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
m512_const1_64( 0x9B05688C9B05688CULL ) );
_mm512_set1_epi64( 0x9B05688C9B05688CULL ) );
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
_mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
_mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL ) );
#define G16W( sigma0, sigma1, a, b, c, d) \
@@ -589,14 +589,14 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_16way_state ) );
S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
S->h[0] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
S->h[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
S->h[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
S->h[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
S->h[4] = _mm512_set1_epi64( 0x510E527F510E527FULL );
S->h[5] = _mm512_set1_epi64( 0x9B05688C9B05688CULL );
S->h[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL );
S->h[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL );
uint32_t *p = ( uint32_t * )( P );

View File

@@ -64,6 +64,22 @@
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
}
// Pivoting about V[1] instead of V[0] reduces latency.
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
V[0] = mm256_shufll_64( V[0] ); \
V[3] = mm256_swap_128( V[3] ); \
V[2] = mm256_shuflr_64( V[2] ); \
BLAKE2B_G( 14, 15, 8, 9, 10, 11, 12, 13 ); \
V[0] = mm256_shuflr_64( V[0] ); \
V[3] = mm256_swap_128( V[3] ); \
V[2] = mm256_shufll_64( V[2] ); \
}
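// A sketch, assuming AVX2: the 64-bit lane rotations above are plausibly
// single vpermq instructions in the project's simd-utils:
/*
#define mm256_shufll_64( v )  _mm256_permute4x64_epi64( v, 0x93 )
#define mm256_shuflr_64( v )  _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_swap_128( v )   _mm256_permute4x64_epi64( v, 0x4e )
*/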
/*
#define BLAKE2B_ROUND( R ) \
{ \
__m256i *V = (__m256i*)v; \
@@ -77,6 +93,7 @@
V[2] = mm256_swap_128( V[2] ); \
V[1] = mm256_shufll_64( V[1] ); \
}
*/
#elif defined(__SSE2__)
// always true

View File

@@ -451,22 +451,22 @@ static const __m128i final_s[16] =
*/
void bmw256_4way_init( bmw256_4way_context *ctx )
{
ctx->H[ 0] = m128_const1_64( 0x4041424340414243 );
ctx->H[ 1] = m128_const1_64( 0x4445464744454647 );
ctx->H[ 2] = m128_const1_64( 0x48494A4B48494A4B );
ctx->H[ 3] = m128_const1_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = m128_const1_64( 0x5051525350515253 );
ctx->H[ 5] = m128_const1_64( 0x5455565754555657 );
ctx->H[ 6] = m128_const1_64( 0x58595A5B58595A5B );
ctx->H[ 7] = m128_const1_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = m128_const1_64( 0x6061626360616263 );
ctx->H[ 9] = m128_const1_64( 0x6465666764656667 );
ctx->H[10] = m128_const1_64( 0x68696A6B68696A6B );
ctx->H[11] = m128_const1_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = m128_const1_64( 0x7071727370717273 );
ctx->H[13] = m128_const1_64( 0x7475767774757677 );
ctx->H[14] = m128_const1_64( 0x78797A7B78797A7B );
ctx->H[15] = m128_const1_64( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
// for ( int i = 0; i < 16; i++ )
@@ -529,7 +529,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
ptr += 4;
h = sc->H;
@@ -959,22 +959,22 @@ static const __m256i final_s8[16] =
void bmw256_8way_init( bmw256_8way_context *ctx )
{
ctx->H[ 0] = m256_const1_64( 0x4041424340414243 );
ctx->H[ 1] = m256_const1_64( 0x4445464744454647 );
ctx->H[ 2] = m256_const1_64( 0x48494A4B48494A4B );
ctx->H[ 3] = m256_const1_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = m256_const1_64( 0x5051525350515253 );
ctx->H[ 5] = m256_const1_64( 0x5455565754555657 );
ctx->H[ 6] = m256_const1_64( 0x58595A5B58595A5B );
ctx->H[ 7] = m256_const1_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = m256_const1_64( 0x6061626360616263 );
ctx->H[ 9] = m256_const1_64( 0x6465666764656667 );
ctx->H[10] = m256_const1_64( 0x68696A6B68696A6B );
ctx->H[11] = m256_const1_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = m256_const1_64( 0x7071727370717273 );
ctx->H[13] = m256_const1_64( 0x7475767774757677 );
ctx->H[14] = m256_const1_64( 0x78797A7B78797A7B );
ctx->H[15] = m256_const1_64( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = _mm256_set1_epi64x( 0x4041424340414243 );
ctx->H[ 1] = _mm256_set1_epi64x( 0x4445464744454647 );
ctx->H[ 2] = _mm256_set1_epi64x( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm256_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm256_set1_epi64x( 0x5051525350515253 );
ctx->H[ 5] = _mm256_set1_epi64x( 0x5455565754555657 );
ctx->H[ 6] = _mm256_set1_epi64x( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm256_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm256_set1_epi64x( 0x6061626360616263 );
ctx->H[ 9] = _mm256_set1_epi64x( 0x6465666764656667 );
ctx->H[10] = _mm256_set1_epi64x( 0x68696A6B68696A6B );
ctx->H[11] = _mm256_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm256_set1_epi64x( 0x7071727370717273 );
ctx->H[13] = _mm256_set1_epi64x( 0x7475767774757677 );
ctx->H[14] = _mm256_set1_epi64x( 0x78797A7B78797A7B );
ctx->H[15] = _mm256_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -1030,7 +1030,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
ptr += 4;
h = ctx->H;
@@ -1460,22 +1460,22 @@ static const __m512i final_s16[16] =
void bmw256_16way_init( bmw256_16way_context *ctx )
{
ctx->H[ 0] = m512_const1_64( 0x4041424340414243 );
ctx->H[ 1] = m512_const1_64( 0x4445464744454647 );
ctx->H[ 2] = m512_const1_64( 0x48494A4B48494A4B );
ctx->H[ 3] = m512_const1_64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = m512_const1_64( 0x5051525350515253 );
ctx->H[ 5] = m512_const1_64( 0x5455565754555657 );
ctx->H[ 6] = m512_const1_64( 0x58595A5B58595A5B );
ctx->H[ 7] = m512_const1_64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = m512_const1_64( 0x6061626360616263 );
ctx->H[ 9] = m512_const1_64( 0x6465666764656667 );
ctx->H[10] = m512_const1_64( 0x68696A6B68696A6B );
ctx->H[11] = m512_const1_64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = m512_const1_64( 0x7071727370717273 );
ctx->H[13] = m512_const1_64( 0x7475767774757677 );
ctx->H[14] = m512_const1_64( 0x78797A7B78797A7B );
ctx->H[15] = m512_const1_64( 0x7C7D7E7F7C7D7E7F );
ctx->H[ 0] = _mm512_set1_epi64( 0x4041424340414243 );
ctx->H[ 1] = _mm512_set1_epi64( 0x4445464744454647 );
ctx->H[ 2] = _mm512_set1_epi64( 0x48494A4B48494A4B );
ctx->H[ 3] = _mm512_set1_epi64( 0x4C4D4E4F4C4D4E4F );
ctx->H[ 4] = _mm512_set1_epi64( 0x5051525350515253 );
ctx->H[ 5] = _mm512_set1_epi64( 0x5455565754555657 );
ctx->H[ 6] = _mm512_set1_epi64( 0x58595A5B58595A5B );
ctx->H[ 7] = _mm512_set1_epi64( 0x5C5D5E5F5C5D5E5F );
ctx->H[ 8] = _mm512_set1_epi64( 0x6061626360616263 );
ctx->H[ 9] = _mm512_set1_epi64( 0x6465666764656667 );
ctx->H[10] = _mm512_set1_epi64( 0x68696A6B68696A6B );
ctx->H[11] = _mm512_set1_epi64( 0x6C6D6E6F6C6D6E6F );
ctx->H[12] = _mm512_set1_epi64( 0x7071727370717273 );
ctx->H[13] = _mm512_set1_epi64( 0x7475767774757677 );
ctx->H[14] = _mm512_set1_epi64( 0x78797A7B78797A7B );
ctx->H[15] = _mm512_set1_epi64( 0x7C7D7E7F7C7D7E7F );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -1531,7 +1531,7 @@ void bmw256_16way_close( bmw256_16way_context *ctx, void *dst )
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
ptr += 4;
h = ctx->H;

View File

@@ -896,22 +896,22 @@ static const __m256i final_b[16] =
static void
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
{
sc->H[ 0] = m256_const1_64( 0x8081828384858687 );
sc->H[ 1] = m256_const1_64( 0x88898A8B8C8D8E8F );
sc->H[ 2] = m256_const1_64( 0x9091929394959697 );
sc->H[ 3] = m256_const1_64( 0x98999A9B9C9D9E9F );
sc->H[ 4] = m256_const1_64( 0xA0A1A2A3A4A5A6A7 );
sc->H[ 5] = m256_const1_64( 0xA8A9AAABACADAEAF );
sc->H[ 6] = m256_const1_64( 0xB0B1B2B3B4B5B6B7 );
sc->H[ 7] = m256_const1_64( 0xB8B9BABBBCBDBEBF );
sc->H[ 8] = m256_const1_64( 0xC0C1C2C3C4C5C6C7 );
sc->H[ 9] = m256_const1_64( 0xC8C9CACBCCCDCECF );
sc->H[10] = m256_const1_64( 0xD0D1D2D3D4D5D6D7 );
sc->H[11] = m256_const1_64( 0xD8D9DADBDCDDDEDF );
sc->H[12] = m256_const1_64( 0xE0E1E2E3E4E5E6E7 );
sc->H[13] = m256_const1_64( 0xE8E9EAEBECEDEEEF );
sc->H[14] = m256_const1_64( 0xF0F1F2F3F4F5F6F7 );
sc->H[15] = m256_const1_64( 0xF8F9FAFBFCFDFEFF );
sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 );
sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F );
sc->H[ 2] = _mm256_set1_epi64x( 0x9091929394959697 );
sc->H[ 3] = _mm256_set1_epi64x( 0x98999A9B9C9D9E9F );
sc->H[ 4] = _mm256_set1_epi64x( 0xA0A1A2A3A4A5A6A7 );
sc->H[ 5] = _mm256_set1_epi64x( 0xA8A9AAABACADAEAF );
sc->H[ 6] = _mm256_set1_epi64x( 0xB0B1B2B3B4B5B6B7 );
sc->H[ 7] = _mm256_set1_epi64x( 0xB8B9BABBBCBDBEBF );
sc->H[ 8] = _mm256_set1_epi64x( 0xC0C1C2C3C4C5C6C7 );
sc->H[ 9] = _mm256_set1_epi64x( 0xC8C9CACBCCCDCECF );
sc->H[10] = _mm256_set1_epi64x( 0xD0D1D2D3D4D5D6D7 );
sc->H[11] = _mm256_set1_epi64x( 0xD8D9DADBDCDDDEDF );
sc->H[12] = _mm256_set1_epi64x( 0xE0E1E2E3E4E5E6E7 );
sc->H[13] = _mm256_set1_epi64x( 0xE8E9EAEBECEDEEEF );
sc->H[14] = _mm256_set1_epi64x( 0xF0F1F2F3F4F5F6F7 );
sc->H[15] = _mm256_set1_epi64x( 0xF8F9FAFBFCFDFEFF );
sc->ptr = 0;
sc->bit_count = 0;
}
@@ -967,7 +967,7 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>3 ] = m256_const1_64( 0x80 );
buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
ptr += 8;
h = sc->H;
@@ -1379,22 +1379,22 @@ static const __m512i final_b8[16] =
void bmw512_8way_init( bmw512_8way_context *ctx )
//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
{
ctx->H[ 0] = m512_const1_64( 0x8081828384858687 );
ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
ctx->H[ 2] = m512_const1_64( 0x9091929394959697 );
ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
ctx->H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
ctx->H[ 2] = _mm512_set1_epi64( 0x9091929394959697 );
ctx->H[ 3] = _mm512_set1_epi64( 0x98999A9B9C9D9E9F );
ctx->H[ 4] = _mm512_set1_epi64( 0xA0A1A2A3A4A5A6A7 );
ctx->H[ 5] = _mm512_set1_epi64( 0xA8A9AAABACADAEAF );
ctx->H[ 6] = _mm512_set1_epi64( 0xB0B1B2B3B4B5B6B7 );
ctx->H[ 7] = _mm512_set1_epi64( 0xB8B9BABBBCBDBEBF );
ctx->H[ 8] = _mm512_set1_epi64( 0xC0C1C2C3C4C5C6C7 );
ctx->H[ 9] = _mm512_set1_epi64( 0xC8C9CACBCCCDCECF );
ctx->H[10] = _mm512_set1_epi64( 0xD0D1D2D3D4D5D6D7 );
ctx->H[11] = _mm512_set1_epi64( 0xD8D9DADBDCDDDEDF );
ctx->H[12] = _mm512_set1_epi64( 0xE0E1E2E3E4E5E6E7 );
ctx->H[13] = _mm512_set1_epi64( 0xE8E9EAEBECEDEEEF );
ctx->H[14] = _mm512_set1_epi64( 0xF0F1F2F3F4F5F6F7 );
ctx->H[15] = _mm512_set1_epi64( 0xF8F9FAFBFCFDFEFF );
ctx->ptr = 0;
ctx->bit_count = 0;
}
@@ -1448,7 +1448,7 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
buf = ctx->buf;
ptr = ctx->ptr;
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
ptr += 8;
h = ctx->H;
@@ -1483,22 +1483,22 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
// Init
H[ 0] = m512_const1_64( 0x8081828384858687 );
H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
H[ 2] = m512_const1_64( 0x9091929394959697 );
H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
H[ 2] = _mm512_set1_epi64( 0x9091929394959697 );
H[ 3] = _mm512_set1_epi64( 0x98999A9B9C9D9E9F );
H[ 4] = _mm512_set1_epi64( 0xA0A1A2A3A4A5A6A7 );
H[ 5] = _mm512_set1_epi64( 0xA8A9AAABACADAEAF );
H[ 6] = _mm512_set1_epi64( 0xB0B1B2B3B4B5B6B7 );
H[ 7] = _mm512_set1_epi64( 0xB8B9BABBBCBDBEBF );
H[ 8] = _mm512_set1_epi64( 0xC0C1C2C3C4C5C6C7 );
H[ 9] = _mm512_set1_epi64( 0xC8C9CACBCCCDCECF );
H[10] = _mm512_set1_epi64( 0xD0D1D2D3D4D5D6D7 );
H[11] = _mm512_set1_epi64( 0xD8D9DADBDCDDDEDF );
H[12] = _mm512_set1_epi64( 0xE0E1E2E3E4E5E6E7 );
H[13] = _mm512_set1_epi64( 0xE8E9EAEBECEDEEEF );
H[14] = _mm512_set1_epi64( 0xF0F1F2F3F4F5F6F7 );
H[15] = _mm512_set1_epi64( 0xF8F9FAFBFCFDFEFF );
// Update
@@ -1530,7 +1530,7 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
__m512i h1[16], h2[16];
size_t u, v;
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
ptr += 8;
if ( ptr > (buf_size - 8) )

View File

@@ -423,21 +423,6 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
// 2 way 128
// This isn't expected to be used with AVX512, so a HW rotate instruction
// is assumed not to be available.
// Use double buffering to optimize serial bit rotations. Full double
// buffering isn't practical because it needs twice as many registers,
// and AVX2 has only half as many as AVX512.
#define ROL2( out0, out1, in0, in1, c ) \
{ \
__m256i t0 = _mm256_slli_epi32( in0, c ); \
__m256i t1 = _mm256_slli_epi32( in1, c ); \
out0 = _mm256_srli_epi32( in0, 32-(c) ); \
out1 = _mm256_srli_epi32( in1, 32-(c) ); \
out0 = _mm256_or_si256( out0, t0 ); \
out1 = _mm256_or_si256( out1, t1 ); \
}
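// A sketch, assuming no hardware rotate below AVX512: the mm256_rol_32
// calls that replace ROL2 plausibly expand to the same shift/shift/or:
/*
#define mm256_rol_32( v, c ) \
   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
                    _mm256_srli_epi32( v, 32-(c) ) )
*/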
static void transform_2way( cube_2way_context *sp )
{
int r;
@@ -460,8 +445,10 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
ROL2( y0, y1, x2, x3, 7 );
ROL2( x2, x3, x0, x1, 7 );
y0 = mm256_rol_32( x2, 7 );
y1 = mm256_rol_32( x3, 7 );
x2 = mm256_rol_32( x0, 7 );
x3 = mm256_rol_32( x1, 7 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( y1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
@@ -474,8 +461,10 @@ static void transform_2way( cube_2way_context *sp )
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
ROL2( y0, x1, x1, x0, 11 );
ROL2( y1, x3, x3, x2, 11 );
y0 = mm256_rol_32( x1, 11 );
x1 = mm256_rol_32( x0, 11 );
y1 = mm256_rol_32( x3, 11 );
x3 = mm256_rol_32( x2, 11 );
x0 = _mm256_xor_si256( y0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( y1, x6 );

View File

@@ -32,7 +32,7 @@ static void transform( cubehashParam *sp )
{
x1 = _mm512_add_epi32( x0, x1 );
x0 = mm512_swap_256( x0 );
x0 = mm512_rol_32( x0, 7 );
x0 = _mm512_xor_si512( x0, x1 );
x1 = mm512_swap128_64( x1 );
x1 = _mm512_add_epi32( x0, x1 );
@@ -58,19 +58,18 @@ static void transform( cubehashParam *sp )
{
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = x0;
x0 = mm256_rol_32( x1, 7 );
x1 = mm256_rol_32( y0, 7 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
y0 = mm256_rol_32( x1, 7 );
y1 = mm256_rol_32( x0, 7 );
x0 = _mm256_xor_si256( y0, x2 );
x1 = _mm256_xor_si256( y1, x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_swap_128( x0 );
y1 = mm256_swap_128( x1 );
x0 = mm256_rol_32( y0, 11 );
x1 = mm256_rol_32( y1, 11 );
x0 = mm256_swap_128( x0 );
x1 = mm256_swap_128( x1 );
x0 = mm256_rol_32( x0, 11 );
x1 = mm256_rol_32( x1, 11 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap64_32( x2 );
@@ -94,47 +93,48 @@ static void transform( cubehashParam *sp )
x6 = _mm_load_si128( (__m128i*)sp->x + 6 );
x7 = _mm_load_si128( (__m128i*)sp->x + 7 );
for (r = 0; r < rounds; ++r) {
x4 = _mm_add_epi32(x0, x4);
x5 = _mm_add_epi32(x1, x5);
x6 = _mm_add_epi32(x2, x6);
x7 = _mm_add_epi32(x3, x7);
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
x0 = _mm_xor_si128(x0, x4);
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_shuffle_epi32(x4, 0x4e);
x5 = _mm_shuffle_epi32(x5, 0x4e);
x6 = _mm_shuffle_epi32(x6, 0x4e);
x7 = _mm_shuffle_epi32(x7, 0x4e);
x4 = _mm_add_epi32(x0, x4);
x5 = _mm_add_epi32(x1, x5);
x6 = _mm_add_epi32(x2, x6);
x7 = _mm_add_epi32(x3, x7);
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
x0 = _mm_xor_si128(x0, x4);
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_shuffle_epi32(x4, 0xb1);
x5 = _mm_shuffle_epi32(x5, 0xb1);
x6 = _mm_shuffle_epi32(x6, 0xb1);
x7 = _mm_shuffle_epi32(x7, 0xb1);
for ( r = 0; r < rounds; ++r )
{
x4 = _mm_add_epi32( x0, x4 );
x5 = _mm_add_epi32( x1, x5 );
x6 = _mm_add_epi32( x2, x6 );
x7 = _mm_add_epi32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = mm128_rol_32( y0, 7 );
x1 = mm128_rol_32( y1, 7 );
x2 = mm128_rol_32( y2, 7 );
x3 = mm128_rol_32( y3, 7 );
x0 = _mm_xor_si128( x0, x4 );
x1 = _mm_xor_si128( x1, x5 );
x2 = _mm_xor_si128( x2, x6 );
x3 = _mm_xor_si128( x3, x7 );
x4 = _mm_shuffle_epi32( x4, 0x4e );
x5 = _mm_shuffle_epi32( x5, 0x4e );
x6 = _mm_shuffle_epi32( x6, 0x4e );
x7 = _mm_shuffle_epi32( x7, 0x4e );
x4 = _mm_add_epi32( x0, x4 );
x5 = _mm_add_epi32( x1, x5 );
x6 = _mm_add_epi32( x2, x6 );
x7 = _mm_add_epi32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = mm128_rol_32( y0, 11 );
x1 = mm128_rol_32( y1, 11 );
x2 = mm128_rol_32( y2, 11 );
x3 = mm128_rol_32( y3, 11 );
x0 = _mm_xor_si128( x0, x4 );
x1 = _mm_xor_si128( x1, x5 );
x2 = _mm_xor_si128( x2, x6 );
x3 = _mm_xor_si128( x3, x7 );
x4 = _mm_shuffle_epi32( x4, 0xb1 );
x5 = _mm_shuffle_epi32( x5, 0xb1 );
x6 = _mm_shuffle_epi32( x6, 0xb1 );
x7 = _mm_shuffle_epi32( x7, 0xb1 );
}
_mm_store_si128( (__m128i*)sp->x, x0 );
@@ -180,25 +180,25 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
if ( hashbitlen == 512 )
{
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
return SUCCESS;
@@ -234,10 +234,10 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi64x( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
transform( sp );
transform( sp );
transform( sp );
@@ -279,10 +279,10 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi64x( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
transform( sp );
transform( sp );
@@ -313,25 +313,25 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
if ( hashbitlen == 512 )
{
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
@@ -358,10 +358,10 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
m128_const_64( 0, 0x80 ) );
_mm_set_epi64x( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
transform( sp );
transform( sp );

View File

@@ -566,16 +566,16 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
state->uHashSize = 256;
state->uBlockLength = 192;
state->uRounds = 8;
state->hashsize = m128_const_64( 0, 0x100 );
state->const1536 = m128_const_64( 0, 0x600 );
state->hashsize = _mm_set_epi64x( 0, 0x100 );
state->const1536 = _mm_set_epi64x( 0, 0x600 );
break;
case 512:
state->uHashSize = 512;
state->uBlockLength = 128;
state->uRounds = 10;
state->hashsize = m128_const_64( 0, 0x200 );
state->const1536 = m128_const_64( 0, 0x400 );
state->hashsize = _mm_set_epi64x( 0, 0x200 );
state->const1536 = _mm_set_epi64x( 0, 0x400 );
break;
default:

View File

@@ -469,8 +469,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
s2 = _mm256_xor_si256( s2, t2 );\
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \
_mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \
state2[ 0 ][ j ] = mm256_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
@@ -480,8 +479,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
s2 = _mm256_xor_si256( s2, t2 ); \
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \
_mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \
state2[ 1 ][ j ] = mm256_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
@@ -491,8 +489,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
s2 = _mm256_xor_si256( s2, t2 ); \
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \
_mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \
state2[ 2 ][ j ] = mm256_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
} while(0)
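// A sketch, assuming AVX512VL: mm256_xor3 folds the two chained XORs it
// replaces above into a single vpternlogq (truth table 0x96 = a ^ b ^ c):
/*
#define mm256_xor3( a, b, c ) \
   _mm256_ternarylogic_epi64( a, b, c, 0x96 )
*/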

View File

@@ -33,11 +33,11 @@ MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
@@ -131,7 +131,7 @@ MYALIGN const unsigned int _IV512[] = {
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\

View File

@@ -139,7 +139,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -237,7 +237,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\

View File

@@ -128,7 +128,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -226,7 +226,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
@@ -275,7 +275,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
b1 = _mm_set_epi64x( 0xffffffffffffffff, 0 ); \
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
a1 = _mm_xor_si128( a1, b1 ); \
a2 = _mm_xor_si128( a2, b1 ); \

View File

@@ -31,7 +31,7 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
}
// The only non-zero field in the IV is the length; it can be hard-coded.
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -48,7 +48,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
@@ -116,7 +116,7 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
else
{
// add first padding
ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
ctx->buffer[rem_ptr] = _mm_set_epi64x( 0, 0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
@@ -148,7 +148,7 @@ int groestl512_full( hashState_groestl* ctx, void* output,
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
// --- update ---
@@ -182,7 +182,7 @@ int groestl512_full( hashState_groestl* ctx, void* output,
else
{
// add first padding
ctx->buffer[i] = m128_const_64( 0, 0x80 );
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
@@ -239,7 +239,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
else
{
// add first padding
ctx->buffer[i] = m128_const_64( 0, 0x80 );
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();

View File

@@ -46,7 +46,7 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
ctx->buffer[i] = _mm_setzero_si128();
}
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
ctx->chaining[ 3 ] = _mm_set_epi64x( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;

View File

@@ -539,7 +539,7 @@ static const __m256i SUBSH_MASK7_2WAY =
j = _mm256_cmpgt_epi8(j, i );\
i = _mm256_add_epi8(i, i);\
j = _mm256_and_si256(j, k);\
i = _mm256_xor_si256(i, j);\
i = mm256_xorand( i, j, k );\
}
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
@@ -550,7 +550,7 @@ static const __m256i SUBSH_MASK7_2WAY =
b0 = a2;\
a1 = _mm256_xor_si256(a1, a2);\
b1 = a3;\
a2 = _mm256_xor_si256(a2, a3);\
TEMP2 = _mm256_xor_si256(a2, a3);\
b2 = a4;\
a3 = _mm256_xor_si256(a3, a4);\
b3 = a5;\
@@ -562,34 +562,20 @@ static const __m256i SUBSH_MASK7_2WAY =
a7 = _mm256_xor_si256(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm256_xor_si256(b0, a4);\
b6 = _mm256_xor_si256(b6, a4);\
b1 = _mm256_xor_si256(b1, a5);\
b7 = _mm256_xor_si256(b7, a5);\
b2 = _mm256_xor_si256(b2, a6);\
b0 = _mm256_xor_si256(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm256_xor_si256(b3, a7);\
b1 = _mm256_xor_si256(b1, a7);\
TEMP1 = b1;\
b4 = _mm256_xor_si256(b4, a0);\
b2 = _mm256_xor_si256(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm256_xor_si256(b5, a1);\
b3 = _mm256_xor_si256(b3, a1);\
b1 = a1;\
b6 = _mm256_xor_si256(b6, a2);\
b4 = _mm256_xor_si256(b4, a2);\
TEMP2 = a2;\
b7 = _mm256_xor_si256(b7, a3);\
b5 = _mm256_xor_si256(b5, a3);\
\
TEMP0 = mm256_xor3( b0, a4, a6 ); \
TEMP1 = mm256_xor3( b1, a5, a7 ); \
b2 = mm256_xor3( b2, a6, a0 ); \
b0 = a0; \
b3 = mm256_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm256_xor3( b6, a4, TEMP2 ); \
b4 = mm256_xor3( b4, a0, TEMP2 ); \
b7 = mm256_xor3( b7, a5, a3 ); \
b5 = mm256_xor3( b5, a1, a3 ); \
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm256_xor_si256(a0, a3);\
a1 = _mm256_xor_si256(a1, a4);\
a2 = _mm256_xor_si256(a2, a5);\
a2 = _mm256_xor_si256( TEMP2, a5);\
a3 = _mm256_xor_si256(a3, a6);\
a4 = _mm256_xor_si256(a4, a7);\
a5 = _mm256_xor_si256(a5, b0);\
@@ -671,7 +657,6 @@ static const __m256i SUBSH_MASK7_2WAY =
\
/* MixBytes */\
MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */

View File

@@ -710,7 +710,7 @@ static const __m256i SUBSH_MASK7_2WAY =
b0 = a2;\
a1 = _mm256_xor_si256(a1, a2);\
b1 = a3;\
a2 = _mm256_xor_si256(a2, a3);\
TEMP2 = _mm256_xor_si256(a2, a3);\
b2 = a4;\
a3 = _mm256_xor_si256(a3, a4);\
b3 = a5;\
@@ -722,34 +722,23 @@ static const __m256i SUBSH_MASK7_2WAY =
a7 = _mm256_xor_si256(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm256_xor_si256(b0, a4);\
b6 = _mm256_xor_si256(b6, a4);\
b1 = _mm256_xor_si256(b1, a5);\
b7 = _mm256_xor_si256(b7, a5);\
b2 = _mm256_xor_si256(b2, a6);\
b0 = _mm256_xor_si256(b0, a6);\
TEMP0 = mm256_xor3( b0, a4, a6 ); \
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm256_xor_si256(b3, a7);\
b1 = _mm256_xor_si256(b1, a7);\
TEMP1 = b1;\
b4 = _mm256_xor_si256(b4, a0);\
b2 = _mm256_xor_si256(b2, a0);\
TEMP1 = mm256_xor3( b1, a5, a7 ); \
b2 = mm256_xor3( b2, a6, a0 ); \
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm256_xor_si256(b5, a1);\
b3 = _mm256_xor_si256(b3, a1);\
b1 = a1;\
b6 = _mm256_xor_si256(b6, a2);\
b4 = _mm256_xor_si256(b4, a2);\
TEMP2 = a2;\
b7 = _mm256_xor_si256(b7, a3);\
b5 = _mm256_xor_si256(b5, a3);\
b0 = a0; \
b3 = mm256_xor3( b3, a7, a1 ); \
b1 = a1; \
b6 = mm256_xor3( b6, a4, TEMP2 ); \
b4 = mm256_xor3( b4, a0, TEMP2 ); \
b7 = mm256_xor3( b7, a5, a3 ); \
b5 = mm256_xor3( b5, a1, a3 ); \
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm256_xor_si256(a0, a3);\
a1 = _mm256_xor_si256(a1, a4);\
a2 = _mm256_xor_si256(a2, a5);\
a2 = _mm256_xor_si256( TEMP2, a5);\
a3 = _mm256_xor_si256(a3, a6);\
a4 = _mm256_xor_si256(a4, a7);\
a5 = _mm256_xor_si256(a5, b0);\

View File

@@ -562,14 +562,14 @@ do { \
for ( int u = 0; u < 64; u++ ) \
{ \
const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \
m0 = _mm512_mask_xor_epi64( m0, dm, m0, _mm512_set1_epi64( tp[0] ) ); \
m1 = _mm512_mask_xor_epi64( m1, dm, m1, _mm512_set1_epi64( tp[1] ) ); \
m2 = _mm512_mask_xor_epi64( m2, dm, m2, _mm512_set1_epi64( tp[2] ) ); \
m3 = _mm512_mask_xor_epi64( m3, dm, m3, _mm512_set1_epi64( tp[3] ) ); \
m4 = _mm512_mask_xor_epi64( m4, dm, m4, _mm512_set1_epi64( tp[4] ) ); \
m5 = _mm512_mask_xor_epi64( m5, dm, m5, _mm512_set1_epi64( tp[5] ) ); \
m6 = _mm512_mask_xor_epi64( m6, dm, m6, _mm512_set1_epi64( tp[6] ) ); \
m7 = _mm512_mask_xor_epi64( m7, dm, m7, _mm512_set1_epi64( tp[7] ) ); \
db = _mm512_ror_epi64( db, 1 ); \
tp += 8; \
} \
@@ -733,17 +733,17 @@ do { \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
@@ -752,29 +752,29 @@ do { \
__m512i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (10ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
alpha[0] = _mm512_set1_epi64( (11ULL << 32) ^ A0 ); \
ROUND_BIG8( alpha ); \
} while (0)
@@ -829,14 +829,14 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
sc->h[0] = m512_const1_64( 0x6c70617273746565 );
sc->h[1] = m512_const1_64( 0x656e62656b204172 );
sc->h[2] = m512_const1_64( 0x302c206272672031 );
sc->h[3] = m512_const1_64( 0x3434362c75732032 );
sc->h[4] = m512_const1_64( 0x3030312020422d33 );
sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
sc->h[7] = m512_const1_64( 0x6769756d2042656c );
sc->h[0] = _mm512_set1_epi64( 0x6c70617273746565 );
sc->h[1] = _mm512_set1_epi64( 0x656e62656b204172 );
sc->h[2] = _mm512_set1_epi64( 0x302c206272672031 );
sc->h[3] = _mm512_set1_epi64( 0x3434362c75732032 );
sc->h[4] = _mm512_set1_epi64( 0x3030312020422d33 );
sc->h[5] = _mm512_set1_epi64( 0x656e2d484c657576 );
sc->h[6] = _mm512_set1_epi64( 0x6c65652c65766572 );
sc->h[7] = _mm512_set1_epi64( 0x6769756d2042656c );
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
@@ -859,7 +859,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
sc->buf[0] = m512_const1_64( 0x80 );
sc->buf[0] = _mm512_set1_epi64( 0x80 );
hamsi_8way_big( sc, sc->buf, 1 );
hamsi_8way_big_final( sc, pad );
@@ -870,6 +870,32 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
// Hamsi 4 way AVX2
#if defined(__AVX512VL__)
#define INPUT_BIG \
do { \
__m256i db = _mm256_ror_epi64( *buf, 1 ); \
const __m256i zero = m256_zero; \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
const __mmask8 dm = _mm256_cmplt_epi64_mask( db, zero ); \
m0 = _mm256_mask_xor_epi64( m0, dm, m0, _mm256_set1_epi64x( tp[0] ) ); \
m1 = _mm256_mask_xor_epi64( m1, dm, m1, _mm256_set1_epi64x( tp[1] ) ); \
m2 = _mm256_mask_xor_epi64( m2, dm, m2, _mm256_set1_epi64x( tp[2] ) ); \
m3 = _mm256_mask_xor_epi64( m3, dm, m3, _mm256_set1_epi64x( tp[3] ) ); \
m4 = _mm256_mask_xor_epi64( m4, dm, m4, _mm256_set1_epi64x( tp[4] ) ); \
m5 = _mm256_mask_xor_epi64( m5, dm, m5, _mm256_set1_epi64x( tp[5] ) ); \
m6 = _mm256_mask_xor_epi64( m6, dm, m6, _mm256_set1_epi64x( tp[6] ) ); \
m7 = _mm256_mask_xor_epi64( m7, dm, m7, _mm256_set1_epi64x( tp[7] ) ); \
db = _mm256_ror_epi64( db, 1 ); \
tp += 8; \
} \
} while (0)
#else
#define INPUT_BIG \
do { \
__m256i db = *buf; \
@@ -880,25 +906,58 @@ do { \
{ \
__m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
m256_const1_64( tp[0] ) ) ); \
_mm256_set1_epi64x( tp[0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
m256_const1_64( tp[1] ) ) ); \
_mm256_set1_epi64x( tp[1] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
m256_const1_64( tp[2] ) ) ); \
_mm256_set1_epi64x( tp[2] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
m256_const1_64( tp[3] ) ) ); \
_mm256_set1_epi64x( tp[3] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
m256_const1_64( tp[4] ) ) ); \
_mm256_set1_epi64x( tp[4] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
m256_const1_64( tp[5] ) ) ); \
_mm256_set1_epi64x( tp[5] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
m256_const1_64( tp[6] ) ) ); \
_mm256_set1_epi64x( tp[6] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
m256_const1_64( tp[7] ) ) ); \
_mm256_set1_epi64x( tp[7] ) ) ); \
tp += 8; \
} \
} while (0)
#endif
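Both INPUT_BIG variants vectorize the same table-driven expansion: one conditional XOR of an eight-word T512 row per input bit. The AVX512VL path gets the condition for free from a sign-bit compare into a mask register (vpcmpq) and applies it with masked XORs, while the AVX2 fallback must materialize the mask as a full vector and AND it into each table word. A scalar sketch of the underlying loop (the bit ordering is an assumption here; it has to match the T512 layout used in this file):

// Hamsi message expansion, scalar sketch: for each of the 64 input bits,
// conditionally XOR one eight-word row of T512 into m0..m7.
uint64_t m[8] = { 0 };
const uint64_t *tp = (const uint64_t*)T512;
for ( int u = 0; u < 64; u++ )
{
   if ( ( db >> u ) & 1 )                 // bit order: assumption
      for ( int k = 0; k < 8; k++ )
         m[k] ^= tp[k];
   tp += 8;
}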
#define SBOX( a, b, c, d ) \
do { \
__m256i t; \
t = a; \
a = mm256_xorand( d, a, c ); \
c = mm256_xor3( a, b, c ); \
b = mm256_xoror( b, d, t ); \
t = _mm256_xor_si256( t, c ); \
d = mm256_xoror( a, b, t ); \
t = mm256_xorand( t, a, b ); \
a = c; \
c = mm256_xor3( b, d, t ); \
b = d; \
d = mm256_not( t ); \
} while (0)
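The rewritten SBOX leans on three-input helpers from this repo's simd-utils. On AVX512VL each one is assumed to collapse to a single vpternlogq; the immediates below follow from evaluating each expression on the truth-table constants a=0xF0, b=0xCC, c=0xAA (a sketch of the assumed expansions, not necessarily the exact definitions):

// Assumed AVX512VL expansions of the helpers used by SBOX above and the
// L macro that follows:
// mm256_xor3( a, b, c )   -> _mm256_ternarylogic_epi64( a, b, c, 0x96 )  // a ^ b ^ c
// mm256_xorand( a, b, c ) -> _mm256_ternarylogic_epi64( a, b, c, 0x78 )  // a ^ ( b & c )
// mm256_xoror( a, b, c )  -> _mm256_ternarylogic_epi64( a, b, c, 0x1e )  // a ^ ( b | c )
// Pre-AVX512VL fallbacks are two-instruction sequences, e.g.
// mm256_xorand( a, b, c ) -> _mm256_xor_si256( a, _mm256_and_si256( b, c ) )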
#define L( a, b, c, d ) \
do { \
a = mm256_rol_32( a, 13 ); \
c = mm256_rol_32( c, 3 ); \
b = mm256_xor3( a, b, c ); \
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
b = mm256_rol_32( b, 1 ); \
d = mm256_rol_32( d, 7 ); \
a = mm256_xor3( a, b, d ); \
c = mm256_xor3( c, d, _mm256_slli_epi32( b, 7 ) ); \
a = mm256_rol_32( a, 5 ); \
c = mm256_rol_32( c, 22 ); \
} while (0)
/*
#define SBOX( a, b, c, d ) \
do { \
__m256i t; \
@@ -937,6 +996,7 @@ do { \
a = mm256_rol_32( a, 5 ); \
c = mm256_rol_32( c, 22 ); \
} while (0)
*/
#define DECL_STATE_BIG \
__m256i c0, c1, c2, c3, c4, c5, c6, c7; \
@@ -1066,17 +1126,17 @@ do { \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_n )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
@@ -1085,29 +1145,29 @@ do { \
__m256i alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_f )[i] ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 1ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 2ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 3ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 4ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 5ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 6ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 7ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 8ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( ( 9ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (10ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
alpha[0] = _mm256_set1_epi64x( (11ULL << 32) ^ A0 ); \
ROUND_BIG( alpha ); \
} while (0)
@@ -1163,14 +1223,14 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
sc->h[0] = m256_const1_64( 0x6c70617273746565 );
sc->h[1] = m256_const1_64( 0x656e62656b204172 );
sc->h[2] = m256_const1_64( 0x302c206272672031 );
sc->h[3] = m256_const1_64( 0x3434362c75732032 );
sc->h[4] = m256_const1_64( 0x3030312020422d33 );
sc->h[5] = m256_const1_64( 0x656e2d484c657576 );
sc->h[6] = m256_const1_64( 0x6c65652c65766572 );
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
sc->h[0] = _mm256_set1_epi64x( 0x6c70617273746565 );
sc->h[1] = _mm256_set1_epi64x( 0x656e62656b204172 );
sc->h[2] = _mm256_set1_epi64x( 0x302c206272672031 );
sc->h[3] = _mm256_set1_epi64x( 0x3434362c75732032 );
sc->h[4] = _mm256_set1_epi64x( 0x3030312020422d33 );
sc->h[5] = _mm256_set1_epi64x( 0x656e2d484c657576 );
sc->h[6] = _mm256_set1_epi64x( 0x6c65652c65766572 );
sc->h[7] = _mm256_set1_epi64x( 0x6769756d2042656c );
}
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
@@ -1193,7 +1253,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch );
sc->buf[0] = m256_const1_64( 0x80 );
sc->buf[0] = _mm256_set1_epi64x( 0x80 );
hamsi_big( sc, sc->buf, 1 );
hamsi_big_final( sc, pad );

View File

@@ -52,6 +52,56 @@ extern "C"{
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif
#if defined(__AVX512VL__)
// ( ~( a ^ b ) ) & c
#define mm128_andnotxor( a, b, c ) \
_mm_ternarylogic_epi32( a, b, c, 0x82 )
#else
#define mm128_andnotxor( a, b, c ) \
_mm_andnot_si128( _mm_xor_si128( a, b ), c )
#endif
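The 0x82 immediate, like the others in this file, can be derived mechanically: evaluate the Boolean function on the truth-table bytes a=0xF0, b=0xCC, c=0xAA and the 8-bit result is the vpternlog immediate. A minimal standalone check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint8_t a = 0xF0, b = 0xCC, c = 0xAA;   // truth-table inputs
   const uint8_t imm = ~( a ^ b ) & c;           // ( ~( a ^ b ) ) & c
   printf( "imm = 0x%02x\n", imm );              // prints imm = 0x82
   return 0;
}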
#define F1(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, mm128_andxor( x1, x0, x4 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( mm128_andxor( x2, _mm_andnot_si128( x3, x1 ), \
mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 ) ), \
mm128_andxor( x4, x1, x5 ), \
mm128_xorand( x0, x3, x5 ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, \
_mm_and_si128( x3, \
mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \
_mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( \
mm128_andxor( x3, x5, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ) ), \
_mm_and_si128( x4, \
mm128_xor3( x0, _mm_andnot_si128( x2, x5 ), \
_mm_xor_si128( x1, x6 ) ) ), \
mm128_xorand( x0, x2, x6 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \
mm128_xor3( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) )
/*
#define F1(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( x0, \
_mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
@@ -96,6 +146,7 @@ extern "C"{
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), \
_mm_and_si128( x3, x6 ) ) )
*/
/*
* The macros below integrate the phi() permutations, depending on the
@@ -740,14 +791,14 @@ do { \
static void
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = m256_const1_32( 0x243F6A88UL );
sc->s1 = m256_const1_32( 0x85A308D3UL );
sc->s2 = m256_const1_32( 0x13198A2EUL );
sc->s3 = m256_const1_32( 0x03707344UL );
sc->s4 = m256_const1_32( 0xA4093822UL );
sc->s5 = m256_const1_32( 0x299F31D0UL );
sc->s6 = m256_const1_32( 0x082EFA98UL );
sc->s7 = m256_const1_32( 0xEC4E6C89UL );
sc->s0 = _mm256_set1_epi32( 0x243F6A88UL );
sc->s1 = _mm256_set1_epi32( 0x85A308D3UL );
sc->s2 = _mm256_set1_epi32( 0x13198A2EUL );
sc->s3 = _mm256_set1_epi32( 0x03707344UL );
sc->s4 = _mm256_set1_epi32( 0xA4093822UL );
sc->s5 = _mm256_set1_epi32( 0x299F31D0UL );
sc->s6 = _mm256_set1_epi32( 0x082EFA98UL );
sc->s7 = _mm256_set1_epi32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;

View File

@@ -76,19 +76,31 @@ do { \
#endif
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#define notxorandnot( a, b, c ) \
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
#else
#define notxorandnot( a, b, c ) \
_mm256_xor_si256( mm256_not( a ), _mm256_andnot_si256( b, c ) )
#endif
#define Sb(x0, x1, x2, x3, c) \
do { \
const __m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
const __m256i cc = _mm256_set1_epi64x( c ); \
x0 = mm256_xorandnot( x0, x2, cc ); \
tmp = mm256_xorand( cc, x0, x1 ); \
x0 = mm256_xorandnot( x0, x3, x2 ); \
x3 = notxorandnot( x3, x1, x2 ); \
x1 = mm256_xorand( x1, x0, x2 ); \
x2 = mm256_xorandnot( x2, x3, x0 ); \
x0 = mm256_xoror( x0, x1, x3 ); \
x3 = mm256_xorand( x3, x1, x2 ); \
x1 = mm256_xorand( x1, tmp, x0 ); \
x2 = _mm256_xor_si256( x2, tmp ); \
} while (0)
@@ -96,11 +108,11 @@ do { \
do { \
x4 = _mm256_xor_si256( x4, x1 ); \
x5 = _mm256_xor_si256( x5, x2 ); \
x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
x6 = mm256_xor3( x6, x3, x0 ); \
x7 = _mm256_xor_si256( x7, x0 ); \
x0 = _mm256_xor_si256( x0, x5 ); \
x1 = _mm256_xor_si256( x1, x6 ); \
x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
x2 = mm256_xor3( x2, x7, x4 ); \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)
@@ -323,12 +335,12 @@ do { \
} while (0)
#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W83(x) Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ), 8 )
#define W84(x) Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )
#define W85(x) Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )
#define W80(x) Wz_8W(x, _mm512_set1_epi64( 0x5555555555555555 ), 1 )
#define W81(x) Wz_8W(x, _mm512_set1_epi64( 0x3333333333333333 ), 2 )
#define W82(x) Wz_8W(x, _mm512_set1_epi64( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W83(x) Wz_8W(x, _mm512_set1_epi64( 0x00FF00FF00FF00FF ), 8 )
#define W84(x) Wz_8W(x, _mm512_set1_epi64( 0x0000FFFF0000FFFF ), 16 )
#define W85(x) Wz_8W(x, _mm512_set1_epi64( 0x00000000FFFFFFFF ), 32 )
#define W86(x) \
do { \
__m512i t = x ## h; \
@@ -352,12 +364,12 @@ do { \
x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
} while (0)
#define W0(x) Wz(x, m256_const1_64( 0x5555555555555555 ), 1 )
#define W1(x) Wz(x, m256_const1_64( 0x3333333333333333 ), 2 )
#define W2(x) Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W3(x) Wz(x, m256_const1_64( 0x00FF00FF00FF00FF ), 8 )
#define W4(x) Wz(x, m256_const1_64( 0x0000FFFF0000FFFF ), 16 )
#define W5(x) Wz(x, m256_const1_64( 0x00000000FFFFFFFF ), 32 )
#define W0(x) Wz(x, _mm256_set1_epi64x( 0x5555555555555555 ), 1 )
#define W1(x) Wz(x, _mm256_set1_epi64x( 0x3333333333333333 ), 2 )
#define W2(x) Wz(x, _mm256_set1_epi64x( 0x0F0F0F0F0F0F0F0F ), 4 )
#define W3(x) Wz(x, _mm256_set1_epi64x( 0x00FF00FF00FF00FF ), 8 )
#define W4(x) Wz(x, _mm256_set1_epi64x( 0x0000FFFF0000FFFF ), 16 )
#define W5(x) Wz(x, _mm256_set1_epi64x( 0x00000000FFFFFFFF ), 32 )
#define W6(x) \
do { \
__m256i t = x ## h; \
@@ -624,22 +636,22 @@ static const sph_u64 IV512[] = {
void jh256_8way_init( jh_8way_context *sc )
{
// bswapped IV256
sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
sc->H[ 0] = _mm512_set1_epi64( 0xebd3202c41a398eb );
sc->H[ 1] = _mm512_set1_epi64( 0xc145b29c7bbecd92 );
sc->H[ 2] = _mm512_set1_epi64( 0xfac7d4609151931c );
sc->H[ 3] = _mm512_set1_epi64( 0x038a507ed6820026 );
sc->H[ 4] = _mm512_set1_epi64( 0x45b92677269e23a4 );
sc->H[ 5] = _mm512_set1_epi64( 0x77941ad4481afbe0 );
sc->H[ 6] = _mm512_set1_epi64( 0x7a176b0226abb5cd );
sc->H[ 7] = _mm512_set1_epi64( 0xa82fff0f4224f056 );
sc->H[ 8] = _mm512_set1_epi64( 0x754d2e7f8996a371 );
sc->H[ 9] = _mm512_set1_epi64( 0x62e27df70849141d );
sc->H[10] = _mm512_set1_epi64( 0x948f2476f7957627 );
sc->H[11] = _mm512_set1_epi64( 0x6c29804757b6d587 );
sc->H[12] = _mm512_set1_epi64( 0x6c0d8eac2d275e5c );
sc->H[13] = _mm512_set1_epi64( 0x0f7a0557c6508451 );
sc->H[14] = _mm512_set1_epi64( 0xea12247067d3e47b );
sc->H[15] = _mm512_set1_epi64( 0x69d71cd313abe389 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -647,22 +659,22 @@ void jh256_8way_init( jh_8way_context *sc )
void jh512_8way_init( jh_8way_context *sc )
{
// bswapped IV512
sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
sc->H[ 0] = _mm512_set1_epi64( 0x17aa003e964bd16f );
sc->H[ 1] = _mm512_set1_epi64( 0x43d5157a052e6a63 );
sc->H[ 2] = _mm512_set1_epi64( 0x0bef970c8d5e228a );
sc->H[ 3] = _mm512_set1_epi64( 0x61c3b3f2591234e9 );
sc->H[ 4] = _mm512_set1_epi64( 0x1e806f53c1a01d89 );
sc->H[ 5] = _mm512_set1_epi64( 0x806d2bea6b05a92a );
sc->H[ 6] = _mm512_set1_epi64( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = _mm512_set1_epi64( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = _mm512_set1_epi64( 0x694ae34105e66901 );
sc->H[ 9] = _mm512_set1_epi64( 0x5ae66f2e8e8ab546 );
sc->H[10] = _mm512_set1_epi64( 0x243c84c1d0a74710 );
sc->H[11] = _mm512_set1_epi64( 0x99c15a2db1716e3b );
sc->H[12] = _mm512_set1_epi64( 0x56f8b19decf657cf );
sc->H[13] = _mm512_set1_epi64( 0x56b116577c8806a7 );
sc->H[14] = _mm512_set1_epi64( 0xfb1785e6dffcc2e3 );
sc->H[15] = _mm512_set1_epi64( 0x4bdd8ccc78465a54 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -721,7 +733,7 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
size_t numz, u;
uint64_t l0, l1;
buf[0] = m512_const1_64( 0x80ULL );
buf[0] = _mm512_set1_epi64( 0x80ULL );
if ( sc->ptr == 0 )
numz = 48;
@@ -772,22 +784,22 @@ jh512_8way_close(void *cc, void *dst)
void jh256_4way_init( jh_4way_context *sc )
{
// bswapped IV256
sc->H[ 0] = m256_const1_64( 0xebd3202c41a398eb );
sc->H[ 1] = m256_const1_64( 0xc145b29c7bbecd92 );
sc->H[ 2] = m256_const1_64( 0xfac7d4609151931c );
sc->H[ 3] = m256_const1_64( 0x038a507ed6820026 );
sc->H[ 4] = m256_const1_64( 0x45b92677269e23a4 );
sc->H[ 5] = m256_const1_64( 0x77941ad4481afbe0 );
sc->H[ 6] = m256_const1_64( 0x7a176b0226abb5cd );
sc->H[ 7] = m256_const1_64( 0xa82fff0f4224f056 );
sc->H[ 8] = m256_const1_64( 0x754d2e7f8996a371 );
sc->H[ 9] = m256_const1_64( 0x62e27df70849141d );
sc->H[10] = m256_const1_64( 0x948f2476f7957627 );
sc->H[11] = m256_const1_64( 0x6c29804757b6d587 );
sc->H[12] = m256_const1_64( 0x6c0d8eac2d275e5c );
sc->H[13] = m256_const1_64( 0x0f7a0557c6508451 );
sc->H[14] = m256_const1_64( 0xea12247067d3e47b );
sc->H[15] = m256_const1_64( 0x69d71cd313abe389 );
sc->H[ 0] = _mm256_set1_epi64x( 0xebd3202c41a398eb );
sc->H[ 1] = _mm256_set1_epi64x( 0xc145b29c7bbecd92 );
sc->H[ 2] = _mm256_set1_epi64x( 0xfac7d4609151931c );
sc->H[ 3] = _mm256_set1_epi64x( 0x038a507ed6820026 );
sc->H[ 4] = _mm256_set1_epi64x( 0x45b92677269e23a4 );
sc->H[ 5] = _mm256_set1_epi64x( 0x77941ad4481afbe0 );
sc->H[ 6] = _mm256_set1_epi64x( 0x7a176b0226abb5cd );
sc->H[ 7] = _mm256_set1_epi64x( 0xa82fff0f4224f056 );
sc->H[ 8] = _mm256_set1_epi64x( 0x754d2e7f8996a371 );
sc->H[ 9] = _mm256_set1_epi64x( 0x62e27df70849141d );
sc->H[10] = _mm256_set1_epi64x( 0x948f2476f7957627 );
sc->H[11] = _mm256_set1_epi64x( 0x6c29804757b6d587 );
sc->H[12] = _mm256_set1_epi64x( 0x6c0d8eac2d275e5c );
sc->H[13] = _mm256_set1_epi64x( 0x0f7a0557c6508451 );
sc->H[14] = _mm256_set1_epi64x( 0xea12247067d3e47b );
sc->H[15] = _mm256_set1_epi64x( 0x69d71cd313abe389 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -795,22 +807,22 @@ void jh256_4way_init( jh_4way_context *sc )
void jh512_4way_init( jh_4way_context *sc )
{
// bswapped IV512
sc->H[ 0] = m256_const1_64( 0x17aa003e964bd16f );
sc->H[ 1] = m256_const1_64( 0x43d5157a052e6a63 );
sc->H[ 2] = m256_const1_64( 0x0bef970c8d5e228a );
sc->H[ 3] = m256_const1_64( 0x61c3b3f2591234e9 );
sc->H[ 4] = m256_const1_64( 0x1e806f53c1a01d89 );
sc->H[ 5] = m256_const1_64( 0x806d2bea6b05a92a );
sc->H[ 6] = m256_const1_64( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = m256_const1_64( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = m256_const1_64( 0x694ae34105e66901 );
sc->H[ 9] = m256_const1_64( 0x5ae66f2e8e8ab546 );
sc->H[10] = m256_const1_64( 0x243c84c1d0a74710 );
sc->H[11] = m256_const1_64( 0x99c15a2db1716e3b );
sc->H[12] = m256_const1_64( 0x56f8b19decf657cf );
sc->H[13] = m256_const1_64( 0x56b116577c8806a7 );
sc->H[14] = m256_const1_64( 0xfb1785e6dffcc2e3 );
sc->H[15] = m256_const1_64( 0x4bdd8ccc78465a54 );
sc->H[ 0] = _mm256_set1_epi64x( 0x17aa003e964bd16f );
sc->H[ 1] = _mm256_set1_epi64x( 0x43d5157a052e6a63 );
sc->H[ 2] = _mm256_set1_epi64x( 0x0bef970c8d5e228a );
sc->H[ 3] = _mm256_set1_epi64x( 0x61c3b3f2591234e9 );
sc->H[ 4] = _mm256_set1_epi64x( 0x1e806f53c1a01d89 );
sc->H[ 5] = _mm256_set1_epi64x( 0x806d2bea6b05a92a );
sc->H[ 6] = _mm256_set1_epi64x( 0xa6ba7520dbcc8e58 );
sc->H[ 7] = _mm256_set1_epi64x( 0xf73bf8ba763a0fa9 );
sc->H[ 8] = _mm256_set1_epi64x( 0x694ae34105e66901 );
sc->H[ 9] = _mm256_set1_epi64x( 0x5ae66f2e8e8ab546 );
sc->H[10] = _mm256_set1_epi64x( 0x243c84c1d0a74710 );
sc->H[11] = _mm256_set1_epi64x( 0x99c15a2db1716e3b );
sc->H[12] = _mm256_set1_epi64x( 0x56f8b19decf657cf );
sc->H[13] = _mm256_set1_epi64x( 0x56b116577c8806a7 );
sc->H[14] = _mm256_set1_epi64x( 0xfb1785e6dffcc2e3 );
sc->H[15] = _mm256_set1_epi64x( 0x4bdd8ccc78465a54 );
sc->ptr = 0;
sc->block_count = 0;
}
@@ -869,7 +881,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
size_t numz, u;
uint64_t l0, l1;
buf[0] = m256_const1_64( 0x80ULL );
buf[0] = _mm256_set1_epi64x( 0x80ULL );
if ( sc->ptr == 0 )
numz = 48;

View File

@@ -49,7 +49,7 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
@@ -101,7 +101,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
pdata[19] = n;
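In both scanhash loops the nonce occupies the upper 32-bit half of each 64-bit lane, so splatting 0x0000000800000000 (or 0x0000000400000000 for 4-way) and adding with 32-bit element width bumps every lane's nonce by the lane count without disturbing the low halves. A sketch of the lane arithmetic:

// Each 64-bit lane is ( nonce : data ) as two 32-bit halves.
// set1_epi64( 0x0000000800000000 ) puts 8 in the high half, 0 in the low,
// and add_epi32 adds the halves independently:
const __m512i step = _mm512_set1_epi64( 0x0000000800000000 );
*noncev = _mm512_add_epi32( *noncev, step );   // nonce += 8 in every lane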

View File

@@ -180,15 +180,15 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = m512_const1_64( t );
u.tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = m512_const1_64( eb );
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
@@ -264,8 +264,8 @@ keccak512_8way_close(void *cc, void *dst)
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = mm256_not( s ) )
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
#define XOROR(d, a, b, c) (d = mm256_xoror( a, b, c ) )
#define XORAND(d, a, b, c) (d = mm256_xorand( a, b, c ) )
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
#include "keccak-macros.c"
@@ -368,15 +368,15 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = m256_const1_64( t );
u.tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = m256_const1_64( eb );
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */

View File

@@ -56,7 +56,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
@@ -115,7 +115,7 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -69,7 +69,7 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
#define MULT24W( a0, a1 ) \
{ \
__m512i b = _mm512_xor_si512( a0, \
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 0x10 ) ); \
a0 = _mm512_alignr_epi8( a1, b, 4 ); \
a1 = _mm512_alignr_epi8( b, a1, 4 ); \
}
@@ -107,49 +107,37 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
ADD_CONSTANT4W( x0, x4, c0, c1 );
#define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) \
a1 = _mm512_shuffle_epi32( a1, 147 ); \
t0 = _mm512_load_si512( &a1 ); \
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
t0 = _mm512_shuffle_epi32( a1, 147 ); \
a1 = _mm512_unpacklo_epi32( t0, a0 ); \
t0 = _mm512_unpackhi_epi32( t0, a0 ); \
t1 = _mm512_shuffle_epi32( t0, 78 ); \
a0 = _mm512_shuffle_epi32( a1, 78 ); \
SUBCRUMB4W( t1, t0, a0, a1 ); \
t0 = _mm512_unpacklo_epi32( t0, t1 ); \
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
a0 = _mm512_load_si512( &a1 ); \
a0 = _mm512_unpackhi_epi64( a0, t0 ); \
a0 = _mm512_unpackhi_epi64( a1, t0 ); \
a1 = _mm512_unpacklo_epi64( a1, t0 ); \
a1 = _mm512_shuffle_epi32( a1, 57 ); \
MIXWORD4W( a0, a1 ); \
ADD_CONSTANT4W( a0, a1, c0, c1 );
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm512_load_si512(&r3);\
q1 = _mm512_load_si512(&p3);\
s3 = _mm512_load_si512(&r3);\
q3 = _mm512_load_si512(&p3);\
s1 = _mm512_unpackhi_epi32(s1,r2);\
q1 = _mm512_unpackhi_epi32(q1,p2);\
s3 = _mm512_unpacklo_epi32(s3,r2);\
q3 = _mm512_unpacklo_epi32(q3,p2);\
s0 = _mm512_load_si512(&s1);\
q0 = _mm512_load_si512(&q1);\
s2 = _mm512_load_si512(&s3);\
q2 = _mm512_load_si512(&q3);\
r3 = _mm512_load_si512(&r1);\
p3 = _mm512_load_si512(&p1);\
r1 = _mm512_unpacklo_epi32(r1,r0);\
p1 = _mm512_unpacklo_epi32(p1,p0);\
r3 = _mm512_unpackhi_epi32(r3,r0);\
p3 = _mm512_unpackhi_epi32(p3,p0);\
s0 = _mm512_unpackhi_epi64(s0,r3);\
q0 = _mm512_unpackhi_epi64(q0,p3);\
s1 = _mm512_unpacklo_epi64(s1,r3);\
q1 = _mm512_unpacklo_epi64(q1,p3);\
s2 = _mm512_unpackhi_epi64(s2,r1);\
q2 = _mm512_unpackhi_epi64(q2,p1);\
s3 = _mm512_unpacklo_epi64(s3,r1);\
q3 = _mm512_unpacklo_epi64(q3,p1);
s1 = _mm512_unpackhi_epi32( r3, r2 ); \
q1 = _mm512_unpackhi_epi32( p3, p2 ); \
s3 = _mm512_unpacklo_epi32( r3, r2 ); \
q3 = _mm512_unpacklo_epi32( p3, p2 ); \
r3 = _mm512_unpackhi_epi32( r1, r0 ); \
r1 = _mm512_unpacklo_epi32( r1, r0 ); \
p3 = _mm512_unpackhi_epi32( p1, p0 ); \
p1 = _mm512_unpacklo_epi32( p1, p0 ); \
s0 = _mm512_unpackhi_epi64( s1, r3 ); \
q0 = _mm512_unpackhi_epi64( q1, p3 ); \
s1 = _mm512_unpacklo_epi64( s1, r3 ); \
q1 = _mm512_unpacklo_epi64( q1, p3 ); \
s2 = _mm512_unpackhi_epi64( s3, r1 ); \
q2 = _mm512_unpackhi_epi64( q3, p1 ); \
s3 = _mm512_unpacklo_epi64( s3, r1 ); \
q3 = _mm512_unpacklo_epi64( q3, p1 );
#define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -198,11 +186,8 @@ void rnd512_4way( luffa_4way_context *state, const __m512i *msg )
chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
MULT24W( chainv[8], chainv[9] );
chainv[8] = _mm512_xor_si512( chainv[8], t0 );
chainv[9] = _mm512_xor_si512( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
t0 = chainv[8] = _mm512_xor_si512( chainv[8], t0 );
t1 = chainv[9] = _mm512_xor_si512( chainv[9], t1 );
MULT24W( chainv[8], chainv[9] );
chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
@@ -538,10 +523,39 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
#if defined(__AVX512VL__)
#define MULT2( a0, a1 ) \
{ \
__m256i b = _mm256_xor_si256( a0, \
_mm256_maskz_shuffle_epi32( 0xbb, a1, 0x10 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \
a0 = t; \
}
#else
#define MULT2( a0, a1 ) \
{ \
__m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
_mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
_mm256_blend_epi32( a1, m256_zero, 0xee ), 0x10 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
@@ -567,26 +581,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
a0 = t; \
}
#endif
#define MIXWORD( a, b ) \
{ \
__m256i t1, t2; \
b = _mm256_xor_si256( a,b ); \
t1 = _mm256_slli_epi32( a, 2 ); \
t2 = _mm256_srli_epi32( a, 30 ); \
a = _mm256_or_si256( t1, t2 ); \
a = _mm256_xor_si256( a, b ); \
t1 = _mm256_slli_epi32( b, 14 ); \
t2 = _mm256_srli_epi32( b, 18 ); \
b = _mm256_or_si256( t1, t2 ); \
b = _mm256_xor_si256( a, b ); \
t1 = _mm256_slli_epi32( a, 10 ); \
t2 = _mm256_srli_epi32( a, 22 ); \
a = _mm256_or_si256( t1,t2 ); \
a = _mm256_xor_si256( a,b ); \
t1 = _mm256_slli_epi32( b,1 ); \
t2 = _mm256_srli_epi32( b,31 ); \
b = _mm256_or_si256( t1, t2 ); \
}
b = _mm256_xor_si256( a, b ); \
a = _mm256_xor_si256( b, mm256_rol_32( a, 2 ) ); \
b = _mm256_xor_si256( a, mm256_rol_32( b, 14 ) ); \
a = _mm256_xor_si256( b, mm256_rol_32( a, 10 ) ); \
b = mm256_rol_32( b, 1 );
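The condensed MIXWORD has the same dataflow as the long form it replaces; each slli/srli/or triple is folded into one rotate and the XORs are re-associated. The scalar equivalent, one 32-bit word per vector lane:

static inline uint32_t rotl32( uint32_t x, int n )
{  return ( x << n ) | ( x >> ( 32 - n ) );  }

static inline void mixword( uint32_t *a, uint32_t *b )
{
   *b ^= *a;
   *a = rotl32( *a, 2 ) ^ *b;
   *b = rotl32( *b, 14 ) ^ *a;
   *a = rotl32( *a, 10 ) ^ *b;
   *b = rotl32( *b, 1 );
}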
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB( x0, x1, x2, x3 ); \
@@ -598,49 +600,37 @@ int luffa_4way_update_close( luffa_4way_context *state,
ADD_CONSTANT( x0, x4, c0, c1 );
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
a1 = _mm256_shuffle_epi32( a1, 147); \
t0 = _mm256_load_si256( &a1 ); \
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
t0 = _mm256_shuffle_epi32( a1, 147 ); \
a1 = _mm256_unpacklo_epi32( t0, a0 ); \
t0 = _mm256_unpackhi_epi32( t0, a0 ); \
t1 = _mm256_shuffle_epi32( t0, 78 ); \
a0 = _mm256_shuffle_epi32( a1, 78 ); \
SUBCRUMB( t1, t0, a0, a1 );\
SUBCRUMB( t1, t0, a0, a1 ); \
t0 = _mm256_unpacklo_epi32( t0, t1 ); \
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
a0 = _mm256_load_si256( &a1 ); \
a0 = _mm256_unpackhi_epi64( a0, t0 ); \
a0 = _mm256_unpackhi_epi64( a1, t0 ); \
a1 = _mm256_unpacklo_epi64( a1, t0 ); \
a1 = _mm256_shuffle_epi32( a1, 57 ); \
MIXWORD( a0, a1 ); \
ADD_CONSTANT( a0, a1, c0, c1 );
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm256_load_si256(&r3);\
q1 = _mm256_load_si256(&p3);\
s3 = _mm256_load_si256(&r3);\
q3 = _mm256_load_si256(&p3);\
s1 = _mm256_unpackhi_epi32(s1,r2);\
q1 = _mm256_unpackhi_epi32(q1,p2);\
s3 = _mm256_unpacklo_epi32(s3,r2);\
q3 = _mm256_unpacklo_epi32(q3,p2);\
s0 = _mm256_load_si256(&s1);\
q0 = _mm256_load_si256(&q1);\
s2 = _mm256_load_si256(&s3);\
q2 = _mm256_load_si256(&q3);\
r3 = _mm256_load_si256(&r1);\
p3 = _mm256_load_si256(&p1);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
r3 = _mm256_unpackhi_epi32(r3,r0);\
p3 = _mm256_unpackhi_epi32(p3,p0);\
s0 = _mm256_unpackhi_epi64(s0,r3);\
q0 = _mm256_unpackhi_epi64(q0,p3);\
s1 = _mm256_unpacklo_epi64(s1,r3);\
q1 = _mm256_unpacklo_epi64(q1,p3);\
s2 = _mm256_unpackhi_epi64(s2,r1);\
q2 = _mm256_unpackhi_epi64(q2,p1);\
s3 = _mm256_unpacklo_epi64(s3,r1);\
q3 = _mm256_unpacklo_epi64(q3,p1);
s1 = _mm256_unpackhi_epi32( r3, r2 ); \
q1 = _mm256_unpackhi_epi32( p3, p2 ); \
s3 = _mm256_unpacklo_epi32( r3, r2 ); \
q3 = _mm256_unpacklo_epi32( p3, p2 ); \
r3 = _mm256_unpackhi_epi32( r1, r0 ); \
r1 = _mm256_unpacklo_epi32( r1, r0 ); \
p3 = _mm256_unpackhi_epi32( p1, p0 ); \
p1 = _mm256_unpacklo_epi32( p1, p0 ); \
s0 = _mm256_unpackhi_epi64( s1, r3 ); \
q0 = _mm256_unpackhi_epi64( q1, p3 ); \
s1 = _mm256_unpacklo_epi64( s1, r3 ); \
q1 = _mm256_unpacklo_epi64( q1, p3 ); \
s2 = _mm256_unpackhi_epi64( s3, r1 ); \
q2 = _mm256_unpackhi_epi64( q3, p1 ); \
s3 = _mm256_unpacklo_epi64( s3, r1 ); \
q3 = _mm256_unpacklo_epi64( q3, p1 );
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -656,17 +646,10 @@ void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
__m256i *chainv = state->chainv;
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
t0 = chainv[0];
t1 = chainv[1];
t0 = _mm256_xor_si256( t0, chainv[2] );
t1 = _mm256_xor_si256( t1, chainv[3] );
t0 = _mm256_xor_si256( t0, chainv[4] );
t1 = _mm256_xor_si256( t1, chainv[5] );
t0 = _mm256_xor_si256( t0, chainv[6] );
t1 = _mm256_xor_si256( t1, chainv[7] );
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
MULT2( t0, t1 );
@@ -701,11 +684,8 @@ void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
t0 = chainv[8] = _mm256_xor_si256( chainv[8], t0 );
t1 = chainv[9] = _mm256_xor_si256( chainv[9], t1 );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
@@ -794,29 +774,22 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
uint32 hash[8*2] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t[2];
__m256i t0, t1;
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/
rnd512_2way( state, NULL );
t[0] = chainv[0];
t[1] = chainv[1];
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t0 = _mm256_shuffle_epi32( t0, 27 );
t1 = _mm256_shuffle_epi32( t1, 27 );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
@@ -825,22 +798,16 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
rnd512_2way( state, NULL );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
t0 = _mm256_shuffle_epi32( t0, 27 );
t1 = _mm256_shuffle_epi32( t1, 27 );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
_mm256_store_si256( (__m256i*)&hash[0], t0 );
_mm256_store_si256( (__m256i*)&hash[8], t1 );
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );

View File

@@ -22,20 +22,29 @@
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#define cns(i) ( ( (__m128i*)CNS_INIT)[i] )
#define ADD_CONSTANT( a, b, c0 ,c1 ) \
a = _mm_xor_si128( a, c0 ); \
b = _mm_xor_si128( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi32( a1, b, 1 ); \
a1 = _mm_alignr_epi32( b, a1, 1 ); \
__m128i b = _mm_xor_si128( a0, \
_mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
__m128i b = _mm_xor_si128( a0, \
_mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
@@ -44,79 +53,88 @@
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
__m128i b = _mm_xor_si128( a0, \
_mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm_shuffle_epi32(a1,147);\
t0 = _mm_load_si128(&a1);\
a1 = _mm_unpacklo_epi32(a1,a0);\
t0 = _mm_unpackhi_epi32(t0,a0);\
t1 = _mm_shuffle_epi32(t0,78);\
a0 = _mm_shuffle_epi32(a1,78);\
SUBCRUMB(t1,t0,a0,a1,tmp0);\
t0 = _mm_unpacklo_epi32(t0,t1);\
a1 = _mm_unpacklo_epi32(a1,a0);\
a0 = _mm_load_si128(&a1);\
a0 = _mm_unpackhi_epi64(a0,t0);\
a1 = _mm_unpacklo_epi64(a1,t0);\
a1 = _mm_shuffle_epi32(a1,57);\
MIXWORD(a0,a1,tmp0,tmp1);\
ADD_CONSTANT(a0,a1,c0,c1);
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m128i t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a2 = _mm_xor_si128( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a1 = _mm_or_si128( a1, a3 ); \
a3 = _mm_xor_si128( a3, a2 ); \
t = _mm_xor_si128( t, a1 ); \
a2 = _mm_and_si128( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a0 = t; \
}
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm_load_si128(&a0);\
a0 = _mm_or_si128(a0,a1);\
a2 = _mm_xor_si128(a2,a3);\
a1 = mm128_not( a1 );\
a0 = _mm_xor_si128(a0,a3);\
a3 = _mm_and_si128(a3,t);\
a1 = _mm_xor_si128(a1,a3);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a0);\
a0 = mm128_not( a0 );\
a2 = _mm_xor_si128(a2,a1);\
a1 = _mm_or_si128(a1,a3);\
t = _mm_xor_si128(t,a1);\
a3 = _mm_xor_si128(a3,a2);\
a2 = _mm_and_si128(a2,a1);\
a1 = _mm_xor_si128(a1,a0);\
a0 = _mm_load_si128(&t);\
#else
#define MIXWORD(a,b,t1,t2)\
b = _mm_xor_si128(a,b);\
t1 = _mm_slli_epi32(a,2);\
t2 = _mm_srli_epi32(a,30);\
a = _mm_or_si128(t1,t2);\
a = _mm_xor_si128(a,b);\
t1 = _mm_slli_epi32(b,14);\
t2 = _mm_srli_epi32(b,18);\
b = _mm_or_si128(t1,t2);\
b = _mm_xor_si128(a,b);\
t1 = _mm_slli_epi32(a,10);\
t2 = _mm_srli_epi32(a,22);\
a = _mm_or_si128(t1,t2);\
a = _mm_xor_si128(a,b);\
t1 = _mm_slli_epi32(b,1);\
t2 = _mm_srli_epi32(b,31);\
b = _mm_or_si128(t1,t2);
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m128i t = a0; \
a0 = _mm_or_si128( a0, a1 ); \
a2 = _mm_xor_si128( a2, a3 ); \
a1 = mm128_not( a1 ); \
a0 = _mm_xor_si128( a0, a3 ); \
a3 = _mm_and_si128( a3, t ); \
a1 = _mm_xor_si128( a1, a3 ); \
a3 = _mm_xor_si128( a3, a2 ); \
a2 = _mm_and_si128( a2, a0 ); \
a0 = mm128_not( a0 ); \
a2 = _mm_xor_si128( a2, a1 ); \
a1 = _mm_or_si128( a1, a3 ); \
t = _mm_xor_si128( t, a1 ); \
a3 = _mm_xor_si128( a3, a2 ); \
a2 = _mm_and_si128( a2, a1 ); \
a1 = _mm_xor_si128( a1, a0 ); \
a0 = t; \
}
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm_xor_si128(a,c0);\
b = _mm_xor_si128(b,c1);\
#endif
#define MIXWORD( a, b ) \
b = _mm_xor_si128( a, b ); \
a = _mm_xor_si128( b, mm128_rol_32( a, 2 ) ); \
b = _mm_xor_si128( a, mm128_rol_32( b, 14 ) ); \
a = _mm_xor_si128( b, mm128_rol_32( a, 10 ) ); \
b = mm128_rol_32( b, 1 );
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB( x0, x1, x2, x3 ); \
SUBCRUMB( x5, x6, x7, x4 ); \
MIXWORD( x0, x4 ); \
MIXWORD( x1, x5 ); \
MIXWORD( x2, x6 ); \
MIXWORD( x3, x7 ); \
ADD_CONSTANT( x0, x4, c0, c1 );
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
t0 = _mm_shuffle_epi32( a1, 147 ); \
a1 = _mm_unpacklo_epi32( t0, a0 ); \
t0 = _mm_unpackhi_epi32( t0, a0 ); \
t1 = _mm_shuffle_epi32( t0, 78 ); \
a0 = _mm_shuffle_epi32( a1, 78 ); \
SUBCRUMB( t1, t0, a0, a1 ); \
t0 = _mm_unpacklo_epi32( t0, t1 ); \
a1 = _mm_unpacklo_epi32( a1, a0 ); \
a0 = _mm_unpackhi_epi64( a1, t0 ); \
a1 = _mm_unpacklo_epi64( a1, t0 ); \
a1 = _mm_shuffle_epi32( a1, 57 ); \
MIXWORD( a0, a1 ); \
ADD_CONSTANT( a0, a1, c0, c1 );
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm_load_si128(&r1);\
@@ -177,32 +195,22 @@
q1 = _mm_load_si128(&p1);\
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm_load_si128(&r3);\
q1 = _mm_load_si128(&p3);\
s3 = _mm_load_si128(&r3);\
q3 = _mm_load_si128(&p3);\
s1 = _mm_unpackhi_epi32(s1,r2);\
q1 = _mm_unpackhi_epi32(q1,p2);\
s3 = _mm_unpacklo_epi32(s3,r2);\
q3 = _mm_unpacklo_epi32(q3,p2);\
s0 = _mm_load_si128(&s1);\
q0 = _mm_load_si128(&q1);\
s2 = _mm_load_si128(&s3);\
q2 = _mm_load_si128(&q3);\
r3 = _mm_load_si128(&r1);\
p3 = _mm_load_si128(&p1);\
r1 = _mm_unpacklo_epi32(r1,r0);\
p1 = _mm_unpacklo_epi32(p1,p0);\
r3 = _mm_unpackhi_epi32(r3,r0);\
p3 = _mm_unpackhi_epi32(p3,p0);\
s0 = _mm_unpackhi_epi64(s0,r3);\
q0 = _mm_unpackhi_epi64(q0,p3);\
s1 = _mm_unpacklo_epi64(s1,r3);\
q1 = _mm_unpacklo_epi64(q1,p3);\
s2 = _mm_unpackhi_epi64(s2,r1);\
q2 = _mm_unpackhi_epi64(q2,p1);\
s3 = _mm_unpacklo_epi64(s3,r1);\
q3 = _mm_unpacklo_epi64(q3,p1);
s1 = _mm_unpackhi_epi32( r3, r2 ); \
q1 = _mm_unpackhi_epi32( p3, p2 ); \
s3 = _mm_unpacklo_epi32( r3, r2 ); \
q3 = _mm_unpacklo_epi32( p3, p2 ); \
r3 = _mm_unpackhi_epi32( r1, r0 ); \
r1 = _mm_unpacklo_epi32( r1, r0 ); \
p3 = _mm_unpackhi_epi32( p1, p0 ); \
p1 = _mm_unpacklo_epi32( p1, p0 ); \
s0 = _mm_unpackhi_epi64( s1, r3 ); \
q0 = _mm_unpackhi_epi64( q1, p3 ); \
s1 = _mm_unpacklo_epi64( s1, r3 ); \
q1 = _mm_unpacklo_epi64( q1, p3 ); \
s2 = _mm_unpackhi_epi64( s3, r1 ); \
q2 = _mm_unpackhi_epi64( q3, p1 ); \
s3 = _mm_unpacklo_epi64( s3, r1 ); \
q3 = _mm_unpacklo_epi64( q3, p1 );
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -306,8 +314,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
casti_m128i( state->buffer, 1 ) = _mm_set_epi32( 0, 0, 0, 0x80000000 );
}
return SUCCESS;
@@ -325,8 +332,7 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
else
{
// empty pad block, constant data
rnd512( state, _mm_setzero_si128(),
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
rnd512( state, _mm_setzero_si128(), _mm_set_epi32( 0, 0, 0, 0x80000000 ) );
}
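Both spellings of the pad block are bit-identical: _mm_set_epi8 lists bytes e15 down to e0 and _mm_set_epi32 lists dwords e3 down to e0, so the 0x80 lands in byte 3 either way. A quick standalone check:

#include <immintrin.h>
#include <string.h>
#include <assert.h>

int main(void)
{
   __m128i by = _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
   __m128i dw = _mm_set_epi32( 0, 0, 0, 0x80000000 );
   assert( memcmp( &by, &dw, sizeof(by) ) == 0 );   // same 128-bit constant
   return 0;
}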
finalization512(state, (uint32*) hashval);
@@ -423,163 +429,119 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
{
__m128i t[2];
__m128i t0, t1;
__m128i *chainv = state->chainv;
__m128i tmp[2];
__m128i x[8];
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
t[0] = chainv[0];
t[1] = chainv[1];
t0 = mm128_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm128_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm128_xor3( t0, chainv[6], chainv[8] );
t1 = mm128_xor3( t1, chainv[7], chainv[9] );
t[0] = _mm_xor_si128( t[0], chainv[2] );
t[1] = _mm_xor_si128( t[1], chainv[3] );
t[0] = _mm_xor_si128( t[0], chainv[4] );
t[1] = _mm_xor_si128( t[1], chainv[5] );
t[0] = _mm_xor_si128( t[0], chainv[6] );
t[1] = _mm_xor_si128( t[1], chainv[7] );
t[0] = _mm_xor_si128( t[0], chainv[8] );
t[1] = _mm_xor_si128( t[1], chainv[9] );
MULT2( t[0], t[1] );
MULT2( t0, t1 );
msg0 = _mm_shuffle_epi32( msg0, 27 );
msg1 = _mm_shuffle_epi32( msg1, 27 );
chainv[0] = _mm_xor_si128( chainv[0], t[0] );
chainv[1] = _mm_xor_si128( chainv[1], t[1] );
chainv[2] = _mm_xor_si128( chainv[2], t[0] );
chainv[3] = _mm_xor_si128( chainv[3], t[1] );
chainv[4] = _mm_xor_si128( chainv[4], t[0] );
chainv[5] = _mm_xor_si128( chainv[5], t[1] );
chainv[6] = _mm_xor_si128( chainv[6], t[0] );
chainv[7] = _mm_xor_si128( chainv[7], t[1] );
chainv[8] = _mm_xor_si128( chainv[8], t[0] );
chainv[9] = _mm_xor_si128( chainv[9], t[1] );
chainv[0] = _mm_xor_si128( chainv[0], t0 );
chainv[1] = _mm_xor_si128( chainv[1], t1 );
chainv[2] = _mm_xor_si128( chainv[2], t0 );
chainv[3] = _mm_xor_si128( chainv[3], t1 );
chainv[4] = _mm_xor_si128( chainv[4], t0 );
chainv[5] = _mm_xor_si128( chainv[5], t1 );
chainv[6] = _mm_xor_si128( chainv[6], t0 );
chainv[7] = _mm_xor_si128( chainv[7], t1 );
chainv[8] = _mm_xor_si128( chainv[8], t0 );
chainv[9] = _mm_xor_si128( chainv[9], t1 );
t[0] = chainv[0];
t[1] = chainv[1];
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1]);
chainv[0] = _mm_xor_si128( chainv[0], chainv[2] );
chainv[1] = _mm_xor_si128( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]);
chainv[2] = _mm_xor_si128(chainv[2], chainv[4]);
chainv[3] = _mm_xor_si128(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm_xor_si128(chainv[4], chainv[6]);
chainv[5] = _mm_xor_si128(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm_xor_si128(chainv[6], chainv[8]);
chainv[7] = _mm_xor_si128(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm_xor_si128( chainv[8], t[0] );
chainv[9] = _mm_xor_si128( chainv[9], t[1] );
t[0] = chainv[8];
t[1] = chainv[9];
t0 = chainv[8] = _mm_xor_si128( chainv[8], t0 );
t1 = chainv[9] = _mm_xor_si128( chainv[9], t1 );
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm_xor_si128( chainv[8], chainv[6] );
chainv[9] = _mm_xor_si128( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm_xor_si128( chainv[6], chainv[4] );
chainv[7] = _mm_xor_si128( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm_xor_si128( chainv[4], chainv[2] );
chainv[5] = _mm_xor_si128( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm_xor_si128( chainv[2], chainv[0] );
chainv[3] = _mm_xor_si128( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg0 );
chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg1 );
chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t0 ), msg0 );
chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1);
chainv[2] = _mm_xor_si128( chainv[2], msg0 );
chainv[3] = _mm_xor_si128( chainv[3], msg1 );
MULT2( msg0, msg1);
chainv[4] = _mm_xor_si128( chainv[4], msg0 );
chainv[5] = _mm_xor_si128( chainv[5], msg1 );
MULT2( msg0, msg1);
chainv[6] = _mm_xor_si128( chainv[6], msg0 );
chainv[7] = _mm_xor_si128( chainv[7], msg1 );
MULT2( msg0, msg1);
chainv[8] = _mm_xor_si128( chainv[8], msg0 );
chainv[9] = _mm_xor_si128( chainv[9], msg1 );
MULT2( msg0, msg1);
chainv[3] = mm128_rol_32( chainv[3], 1 );
chainv[5] = mm128_rol_32( chainv[5], 2 );
chainv[7] = mm128_rol_32( chainv[7], 3 );
chainv[9] = mm128_rol_32( chainv[9], 4 );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1),
_mm_srli_epi32(chainv[3], 31) );
chainv[5] = _mm_or_si128( _mm_slli_epi32(chainv[5], 2),
_mm_srli_epi32(chainv[5], 30) );
chainv[7] = _mm_or_si128( _mm_slli_epi32(chainv[7], 3),
_mm_srli_epi32(chainv[7], 29) );
chainv[9] = _mm_or_si128( _mm_slli_epi32(chainv[9], 4),
_mm_srli_epi32(chainv[9], 28) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );
MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
STEP_PART( &x[0], &CNS128[ 0], &tmp[0] );
STEP_PART( &x[0], &CNS128[ 2], &tmp[0] );
STEP_PART( &x[0], &CNS128[ 4], &tmp[0] );
STEP_PART( &x[0], &CNS128[ 6], &tmp[0] );
STEP_PART( &x[0], &CNS128[ 8], &tmp[0] );
STEP_PART( &x[0], &CNS128[10], &tmp[0] );
STEP_PART( &x[0], &CNS128[12], &tmp[0] );
STEP_PART( &x[0], &CNS128[14], &tmp[0] );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[16], CNS128[17],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[18], CNS128[19],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[20], CNS128[21],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[22], CNS128[23],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[24], CNS128[25],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[26], CNS128[27],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[28], CNS128[29],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[30], CNS128[31],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
}
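Besides replacing the t[]/tmp[] arrays with scalars, the rewrite drops the _mm_load_si128( &reg ) self-load idiom: loading a local __m128i through its own address is only a register copy (and can force a stack spill). Expressing the dependency directly does the same work in one op, e.g.:

// old:  t0 = _mm_load_si128( &a1 );          // copy a1 through memory
//       ... use t0 ...
// new:  read a1 where it is needed:
t0 = _mm_shuffle_epi32( a1, 147 );            // copy + shuffle in one op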
@@ -588,51 +550,6 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
/* state: hash context */
/* b[8]: hash values */
#if defined (__AVX2__)
static void finalization512( hashState_luffa *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
__m256i* chainv = (__m256i*)state->chainv;
__m256i t;
const __m128i zero = m128_zero;
const __m256i shuff_bswap32 = _mm256_set_epi64x( 0x1c1d1e1f18191a1b,
0x1415161710111213,
0x0c0d0e0f08090a0b,
0x0405060700010203 );
rnd512( state, zero, zero );
t = chainv[0];
t = _mm256_xor_si256( t, chainv[1] );
t = _mm256_xor_si256( t, chainv[2] );
t = _mm256_xor_si256( t, chainv[3] );
t = _mm256_xor_si256( t, chainv[4] );
t = _mm256_shuffle_epi32( t, 27 );
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
rnd512( state, zero, zero );
t = chainv[0];
t = _mm256_xor_si256( t, chainv[1] );
t = _mm256_xor_si256( t, chainv[2] );
t = _mm256_xor_si256( t, chainv[3] );
t = _mm256_xor_si256( t, chainv[4] );
t = _mm256_shuffle_epi32( t, 27 );
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
}
#else
static void finalization512( hashState_luffa *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
@@ -685,6 +602,5 @@ static void finalization512( hashState_luffa *state, uint32 *b )
casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
}
#endif
/***************************************************/

View File

@@ -212,7 +212,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -398,7 +398,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );

View File

@@ -203,7 +203,7 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
*noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -345,7 +345,7 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -287,7 +287,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -389,7 +389,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
*noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
pdata[19] = n;

View File

@@ -103,7 +103,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = m512_const1_32( 16 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -213,7 +213,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = m256_const1_32( 8 );
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
@@ -328,7 +328,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
*noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

View File

@@ -62,10 +62,10 @@ inline void initState( uint64_t State[/*16*/] )
state[1] = zero;
state[2] = zero;
state[3] = zero;
state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
state[4] = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state[5] = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state[6] = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state[7] = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
#else
//First 512 bits are zeros
@@ -299,10 +299,10 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
state1 =
state2 =
state3 = m128_zero;
state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state7 = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
state4 = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
state5 = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
state6 = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
state7 = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
for ( int i = 0; i < nBlocks; i++ )
{
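Note the argument order in these initializers: _mm_set_epi64x takes the high element first, so each register packs two consecutive Blake2b IV words with the even-indexed word in lane 0:

// state[4] holds ( IV[1] : IV[0] ): lane 0 = 0x6a09e667f3bcc908 = IV[0],
// lane 1 = 0xbb67ae8584caa73b = IV[1].
__m128i s4 = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );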

View File

@@ -43,27 +43,29 @@ static const uint64_t blake2b_IV[8] =
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
/*Blake2b's rotation*/
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
// Serial data is only 32 bytes, so AVX2 is the limit for that dimension.
// However, 2-way parallel looks trivial to code for AVX512 except for
// a data dependency with rowa.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define G2W_4X64(a,b,c,d) \
a = _mm512_add_epi64( a, b ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 24 ); \
a = _mm512_add_epi64( a, b ); \
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 16 ); \
c = _mm512_add_epi64( c, d ); \
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
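Replacing mm512_ror_64 with _mm512_ror_epi64 just names the native AVX512 rotate (vprorq) directly. On AVX2, where the G_4X64 path below still runs, there is no vector rotate, so the mm256_ror_64 helper is assumed to emulate it along these lines:

// Sketch of an emulated 64-bit right-rotate where no vprorq exists:
static inline __m256i ror64_avx2( __m256i v, const int c )
{
   return _mm256_or_si256( _mm256_srli_epi64( v, c ),
                           _mm256_slli_epi64( v, 64 - c ) );
}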
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s0 = mm512_shufll256_64( s0 ); \
s3 = mm512_swap256_128( s3); \
s2 = mm512_shuflr256_64( s2 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s0 = mm512_shuflr256_64( s0 ); \
s3 = mm512_swap256_128( s3 ); \
s2 = mm512_shufll256_64( s2 );
/*
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G2W_4X64( s0, s1, s2, s3 ); \
s3 = mm512_shufll256_64( s3 ); \
@@ -73,6 +75,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
s3 = mm512_shuflr256_64( s3 ); \
s1 = mm512_shufll256_64( s1 ); \
s2 = mm512_swap256_128( s2 );
*/
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -88,13 +91,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )
#endif // AVX512
#if defined __AVX2__
#if defined(__AVX2__)
// process 4 columns in parallel
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
@@ -105,6 +105,18 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
// Pivot about s1 instead of s0 reduces latency.
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s0 = mm256_shufll_64( s0 ); \
s3 = mm256_swap_128( s3); \
s2 = mm256_shuflr_64( s2 ); \
G_4X64( s0, s1, s2, s3 ); \
s0 = mm256_shuflr_64( s0 ); \
s3 = mm256_swap_128( s3 ); \
s2 = mm256_shufll_64( s2 );
/*
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s3 = mm256_shufll_64( s3 ); \
@@ -114,6 +126,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
s3 = mm256_shuflr_64( s3 ); \
s1 = mm256_shufll_64( s1 ); \
s2 = mm256_swap_128( s2 );
*/
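The shuffle pattern above is easiest to see on lane indices. A minimal scalar sketch (assuming shufll rotates lanes up by one and shuflr down by one; the diagonal property holds with either naming) shows that rotating s0 one way, s2 the other, and swapping s3's halves lines each former diagonal up as a column while s1 is left untouched:

#include <stdio.h>

int main()
{
    // Rows of the 4x4 state as lane indices; G works on column i,
    // i.e. ( r0[i], r1[i], r2[i], r3[i] ).
    int r0[4] = {0,1,2,3}, r1[4] = {0,1,2,3};
    int r2[4] = {0,1,2,3}, r3[4] = {0,1,2,3};
    int t[4];
    for ( int i = 0; i < 4; i++ ) t[(i+1)&3] = r0[i];   // s0 rotated one lane
    for ( int i = 0; i < 4; i++ ) r0[i] = t[i];
    for ( int i = 0; i < 4; i++ ) t[(i+2)&3] = r3[i];   // s3 halves swapped
    for ( int i = 0; i < 4; i++ ) r3[i] = t[i];
    for ( int i = 0; i < 4; i++ ) t[(i+3)&3] = r2[i];   // s2 rotated back
    for ( int i = 0; i < 4; i++ ) r2[i] = t[i];
    // Each column now holds four distinct lane indices: a diagonal.
    for ( int i = 0; i < 4; i++ )
        printf( "col %d: %d %d %d %d\n", i, r0[i], r1[i], r2[i], r3[i] );
    return 0;
}

Since s1 is the last value G_4X64 produces ( b = ror( b ^ c, 63 ) is the final operation ), keeping it out of the shuffle chain lets the three shuffles overlap with the tail of G rather than extend it, which appears to be the latency win the comment refers to.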
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -182,8 +195,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#endif // AVX2 else SSE2
// Scalar
// Blake2b's G function
/*
// Scalar, not used.
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
#define G(r,i,a,b,c,d) \
do { \
a = a + b; \
@@ -196,8 +214,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = rotr64(b ^ c, 63); \
} while(0)
/*One Round of the Blake2b's compression function*/
#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -207,6 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

View File

@@ -51,7 +51,7 @@ void anime_8way_hash( void *state, const void *input )
__m512i* vhA = (__m512i*)vhashA;
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
const __m512i bit3_mask = m512_const1_64( 8 );
const __m512i bit3_mask = _mm512_set1_epi64( 8 );
__mmask8 vh_mask;
anime_8way_context_overlay ctx __attribute__ ((aligned (64)));
@@ -209,7 +209,7 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
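The odd-looking increment works because the 80-byte header is interleaved with 64-bit granularity here, so each 64-bit lane packs header word 18 in its low half and the 32-bit nonce (word 19) in its high half. A 32-bit add of a 64-bit broadcast whose low half is zero therefore bumps only the nonce. A minimal SSE2 illustration of the trick (not code from the miner, two lanes instead of eight):

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

int main()
{
    // One 64-bit lane per nonce: header word 18 in the low half, the
    // nonce in the high half, little-endian layout.
    __m128i lane = _mm_set_epi32( /* nonce */ 100, /* word18 */ 7,
                                  /* nonce */  99, /* word18 */ 7 );
    // A 32-bit add of set1_epi64( 8 << 32 ) touches only the high
    // halves: every lane's nonce advances by 8, word 18 is untouched.
    lane = _mm_add_epi32( lane, _mm_set1_epi64x( 0x0000000800000000 ) );
    uint32_t w[4];
    _mm_storeu_si128( (__m128i*)w, lane );
    printf( "%u %u %u %u\n", w[0], w[1], w[2], w[3] );  // 7 107 7 108
    return 0;
}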
@@ -248,7 +248,7 @@ void anime_4way_hash( void *state, const void *input )
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
int h_mask;
const __m256i bit3_mask = m256_const1_64( 8 );
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i zero = _mm256_setzero_si256();
anime_4way_context_overlay ctx __attribute__ ((aligned (64)));
@@ -388,7 +388,7 @@ int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -75,7 +75,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
uint32_t hash7 [16] __attribute__ ((aligned (32)));
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
__mmask8 vh_mask;
const __m512i vmask = m512_const1_64( 24 );
const __m512i vmask = _mm512_set1_epi64( 24 );
const uint32_t mask = 24;
__m512i* vh = (__m512i*)vhash;
__m512i* vhA = (__m512i*)vhashA;
@@ -593,7 +593,7 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
@@ -647,7 +647,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
__m256i vh_mask;
int h_mask;
const __m256i vmask = m256_const1_64( 24 );
const __m256i vmask = _mm256_set1_epi64x( 24 );
const uint32_t mask = 24;
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
@@ -1041,7 +1041,7 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -67,7 +67,7 @@ void quark_8way_hash( void *state, const void *input )
__mmask8 vh_mask;
quark_8way_ctx_holder ctx;
const uint32_t mask = 8;
const __m512i bit3_mask = m512_const1_64( mask );
const __m512i bit3_mask = _mm512_set1_epi64( mask );
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
@@ -224,7 +224,7 @@ int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
@@ -271,7 +271,7 @@ void quark_4way_hash( void *state, const void *input )
__m256i vh_mask;
int h_mask;
quark_4way_ctx_holder ctx;
const __m256i bit3_mask = m256_const1_64( 8 );
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i zero = _mm256_setzero_si256();
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
@@ -397,7 +397,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

View File

@@ -47,7 +47,7 @@ static const uint32_t IV[5] =
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
m128_const1_64( k ) ), s ), e ); \
_mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
} while (0)
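Note that k arrives as a 64-bit constant with its 32-bit value doubled up, so the 64-bit broadcast is bit-identical to a 32-bit one; the pattern recurs throughout this commit ( 0x6A09E6676A09E667 and friends ). A quick check:

#include <stdio.h>
#include <immintrin.h>

int main()
{
    // Broadcasting a 64-bit value whose halves repeat the same 32-bit
    // constant is identical to a 32-bit broadcast; the 64-bit form lets
    // one macro serve constants written as 0xKKKKKKKKKKKKKKKK.
    __m128i a = _mm_set1_epi64x( 0x6A09E6676A09E667 );
    __m128i b = _mm_set1_epi32( 0x6A09E667 );
    printf( "equal: %d\n",
            _mm_movemask_epi8( _mm_cmpeq_epi32( a, b ) ) == 0xFFFF );
    return 0;
}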
@@ -251,11 +251,11 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )
void ripemd160_4way_init( ripemd160_4way_context *sc )
{
sc->val[0] = m128_const1_64( 0x6745230167452301 );
sc->val[1] = m128_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m128_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m128_const1_64( 0x1032547610325476 );
sc->val[4] = m128_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 );
sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 );
sc->val[2] = _mm_set1_epi64x( 0x98BADCFE98BADCFE );
sc->val[3] = _mm_set1_epi64x( 0x1032547610325476 );
sc->val[4] = _mm_set1_epi64x( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
@@ -347,7 +347,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
do{ \
a = _mm256_add_epi32( mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( \
_mm256_add_epi32( a, f( b ,c, d ) ), r ), \
m256_const1_64( k ) ), s ), e ); \
_mm256_set1_epi64x( k ) ), s ), e ); \
c = mm256_rol_32( c, 10 );\
} while (0)
@@ -552,11 +552,11 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )
void ripemd160_8way_init( ripemd160_8way_context *sc )
{
sc->val[0] = m256_const1_64( 0x6745230167452301 );
sc->val[1] = m256_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m256_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m256_const1_64( 0x1032547610325476 );
sc->val[4] = m256_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 );
sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 );
sc->val[2] = _mm256_set1_epi64x( 0x98BADCFE98BADCFE );
sc->val[3] = _mm256_set1_epi64x( 0x1032547610325476 );
sc->val[4] = _mm256_set1_epi64x( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
@@ -649,7 +649,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
do{ \
a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
_mm512_add_epi32( a, f( b ,c, d ) ), r ), \
m512_const1_64( k ) ), s ), e ); \
_mm512_set1_epi64( k ) ), s ), e ); \
c = mm512_rol_32( c, 10 );\
} while (0)
@@ -853,11 +853,11 @@ static void ripemd160_16way_round( ripemd160_16way_context *sc )
void ripemd160_16way_init( ripemd160_16way_context *sc )
{
sc->val[0] = m512_const1_64( 0x6745230167452301 );
sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m512_const1_64( 0x1032547610325476 );
sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 );
sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = _mm512_set1_epi64( 0x98BADCFE98BADCFE );
sc->val[3] = _mm512_set1_epi64( 0x1032547610325476 );
sc->val[4] = _mm512_set1_epi64( 0xC3D2E1F0C3D2E1F0 );
sc->count_high = sc->count_low = 0;
}
@@ -902,7 +902,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
const int pad = block_size - 8;
ptr = (unsigned)sc->count_low & ( block_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
sc->buf[ ptr>>2 ] = _mm512_set1_epi32( 0x80 );
ptr += 4;
if ( ptr > pad )

View File

@@ -311,7 +311,7 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
__m128i A, B, C, D, E, F, G, H;
__m128i W[16]; memcpy_128( W, data, 16 );
// Value required by H after round 60 to produce valid final hash
const __m128i H_ = m128_const1_32( 0x136032ED );
const __m128i H_ = _mm_set1_epi32( 0x136032ED );
A = _mm_load_si128( state_in );
B = _mm_load_si128( state_in+1 );
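As a sanity check on the magic number (assuming the standard SHA-256 IV and round constants), the identity quoted later in this commit, H_ = -( H256[7] + K256[60] ), reproduces it:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main()
{
    const uint32_t H7  = 0x5BE0CD19;  // SHA-256 IV word 7
    const uint32_t K60 = 0x90BEFFFA;  // SHA-256 round constant K[60]
    // 0x5BE0CD19 + 0x90BEFFFA = 0xEC9FCD13; its two's complement is the
    // value H must hold entering round 60 for final hash word 7 == 0.
    printf( "H_ = %08" PRIX32 "\n", (uint32_t)( 0u - ( H7 + K60 ) ) );  // 136032ED
    return 0;
}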
@@ -408,14 +408,14 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
void sha256_4way_init( sha256_4way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = m128_const1_64( 0x6A09E6676A09E667 );
sc->val[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
sc->val[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
sc->val[3] = m128_const1_64( 0xA54FF53AA54FF53A );
sc->val[4] = m128_const1_64( 0x510E527F510E527F );
sc->val[5] = m128_const1_64( 0x9B05688C9B05688C );
sc->val[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
sc->val[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
}
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
@@ -458,7 +458,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -474,8 +474,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = m128_const1_32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
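The close() logic above is the usual Merkle-Damgard strengthening applied per lane: a 0x80 marker, zero fill to 56 mod 64, then the message length in bits as a 64-bit big-endian value. A scalar sketch of the same rule (hypothetical helper, not part of this codebase):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static size_t sha256_pad( uint8_t *block, size_t len_in_block,
                          uint64_t total_len_bytes )
{
    size_t ptr = len_in_block;
    block[ptr++] = 0x80;                     // padding marker
    size_t end = ( ptr <= 56 ) ? 56 : 120;   // may spill into a 2nd block
    memset( block + ptr, 0, end - ptr );
    uint64_t bits = total_len_bytes << 3;    // length in bits, big-endian
    for ( int i = 0; i < 8; i++ )
        block[ end + i ] = (uint8_t)( bits >> ( 56 - 8*i ) );
    return end + 8;                          // 64 or 128 bytes to hash
}

int main()
{
    uint8_t buf[128] = "abc";
    size_t n = sha256_pad( buf, 3, 3 );
    printf( "padded to %zu bytes, last byte %02x\n", n, buf[n-1] );  // 24 bits
    return 0;
}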
@@ -589,7 +589,6 @@ do { \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
#define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
@@ -863,7 +862,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
{
__m256i A, B, C, D, E, F, G, H;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i H_ = m256_const1_32( 0x136032ED );
const __m256i H_ = _mm256_set1_epi32( 0x136032ED );
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in+1 );
@@ -979,14 +978,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
void sha256_8way_init( sha256_8way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 );
sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A );
sc->val[4] = m256_const1_64( 0x510E527F510E527F );
sc->val[5] = m256_const1_64( 0x9B05688C9B05688C );
sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
}
// Need to handle odd byte lengths for yespower.
@@ -1032,7 +1031,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1048,8 +1047,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = m256_const1_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
@@ -1360,7 +1359,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// Value for H at round 60, before adding K, needed to produce valid final
// hash where H == 0.
// H_ = -( H256[7] + K256[60] );
const __m512i H_ = m512_const1_32( 0x136032ED );
const __m512i H_ = _mm512_set1_epi32( 0x136032ED );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in+1 );
@@ -1453,14 +1452,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
void sha256_16way_init( sha256_16way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = m512_const1_64( 0x6A09E6676A09E667 );
sc->val[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
sc->val[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
sc->val[3] = m512_const1_64( 0xA54FF53AA54FF53A );
sc->val[4] = m512_const1_64( 0x510E527F510E527F );
sc->val[5] = m512_const1_64( 0x9B05688C9B05688C );
sc->val[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
sc->val[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
@@ -1504,7 +1503,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1520,8 +1519,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = m512_const1_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );

View File

@@ -28,32 +28,32 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
vdata[i] = _mm512_set1_epi32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m512_const1_64( 0x510E527F510E527F );
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sha256_16way_transform_le( midstate1, vdata, initstate );
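Two things are pre-computed here: the nonce-free first 64 header bytes are compressed once into midstate1, and the second header block plus the 32-byte second-pass block are pre-padded so the scan loop never touches padding again. Per lane the pre-padded second block looks like this (scalar sketch):

#include <stdio.h>
#include <stdint.h>

int main()
{
    uint32_t block2[16] = {0};
    // words 0..3 are filled from pdata[16..19]; word 3 is the nonce
    block2[ 4] = 0x80000000;  // padding marker
    block2[15] = 80 * 8;      // total header length in bits
    for ( int i = 0; i < 16; i++ ) printf( "%08x ", block2[i] );
    printf( "\n" );
    return 0;
}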
@@ -116,31 +116,31 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = _mm256_set1_epi32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m256_const1_64( 0x510E527F510E527F );
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sha256_8way_transform_le( midstate1, vdata, initstate );
@@ -204,31 +204,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
vdata[i] = _mm_set1_epi32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );

View File

@@ -68,7 +68,7 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
*noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
@@ -140,7 +140,7 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;

View File

@@ -28,31 +28,31 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m512_const1_32( pdata[i] );
vdata[i] = _mm512_set1_epi32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = m512_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = m512_const1_32( 32*8 ); // bit count
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m512_const1_64( 0x510E527F510E527F );
initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sha256_16way_transform_le( midstate1, vdata, initstate );
@@ -120,31 +120,31 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m256_const1_32( pdata[i] );
vdata[i] = _mm256_set1_epi32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = m256_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = m256_const1_32( 32*8 ); // bit count
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m256_const1_64( 0x510E527F510E527F );
initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sha256_8way_transform_le( midstate1, vdata, initstate );
@@ -215,31 +215,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
vdata[i] = _mm_set1_epi32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );
@@ -302,31 +302,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = m128_const1_32( pdata[i] );
vdata[i] = _mm_set1_epi32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = m128_const1_32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = m128_const1_32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
initstate[4] = m128_const1_64( 0x510E527F510E527F );
initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );

View File

@@ -243,7 +243,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
@@ -268,51 +268,56 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
// SHA-512 4 way 64 bit
#define BSG5_0( x ) mm256_xor3( mm256_ror_64( x, 28 ), \
mm256_ror_64( x, 34 ), \
mm256_ror_64( x, 39 ) )
#define BSG5_1( x ) mm256_xor3( mm256_ror_64( x, 14 ), \
mm256_ror_64( x, 18 ), \
mm256_ror_64( x, 41 ) )
#define SSG5_0( x ) mm256_xor3( mm256_ror_64( x, 1 ), \
mm256_ror_64( x, 8 ), \
_mm256_srli_epi64( x, 7 ) )
#define SSG5_1( x ) mm256_xor3( mm256_ror_64( x, 19 ), \
mm256_ror_64( x, 61 ), \
_mm256_srli_epi64( x, 6 ) )
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
// 4-way is not used with AVX512 but will be with AVX10_256 when it
// becomes available.
#define CH( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xca )
#define MAJ( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xe8 )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
T1 = _mm256_add_epi64( T1, H ); \
T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
T1 = _mm256_add_epi64( T1, T0 ); \
D = _mm256_add_epi64( D, T1 ); \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
#else // AVX2 only
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJ(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
#define BSG5_0(x) \
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 5 ), x ), 6 ), x ), 28 )
#define BSG5_1(x) \
mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
_mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
/*
#define SSG5_0(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 7) )
#define SSG5_1(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
*/
// Interleave SSG0 & SSG1 for better throughput.
// return ssg0(w0) + ssg1(w1)
static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
{
__m256i w0a, w1a, w0b, w1b;
w0a = mm256_ror_64( w0, 1 );
w1a = mm256_ror_64( w1,19 );
w0b = mm256_ror_64( w0, 8 );
w1b = mm256_ror_64( w1,61 );
w0a = _mm256_xor_si256( w0a, w0b );
w1a = _mm256_xor_si256( w1a, w1b );
w0b = _mm256_srli_epi64( w0, 7 );
w1b = _mm256_srli_epi64( w1, 6 );
w0a = _mm256_xor_si256( w0a, w0b );
w1a = _mm256_xor_si256( w1a, w1b );
return _mm256_add_epi64( w0a, w1a );
}
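For reference, the scalar equivalent (the standard SHA-512 small sigmas, summed the same way the helper returns ssg0(w0) + ssg1(w1)):

#include <stdio.h>
#include <stdint.h>

static inline uint64_t ror64( uint64_t x, unsigned c )
{ return ( x >> c ) | ( x << ( 64 - c ) ); }

static uint64_t ssg512_add_ref( uint64_t w0, uint64_t w1 )
{
    uint64_t s0 = ror64( w0,  1 ) ^ ror64( w0,  8 ) ^ ( w0 >> 7 );
    uint64_t s1 = ror64( w1, 19 ) ^ ror64( w1, 61 ) ^ ( w1 >> 6 );
    return s0 + s1;
}

int main()
{
    printf( "%016llx\n",
            (unsigned long long)ssg512_add_ref( 0x0123456789abcdefULL, 1 ) );
    return 0;
}

Computing both sigmas in lock-step gives the out-of-order core two independent rotate/xor chains to keep the pipeline full, which is the throughput gain the comment describes.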
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -324,19 +329,27 @@ do { \
H = _mm256_add_epi64( T1, T2 ); \
} while (0)
#endif // AVX512VL AVX10_256
static void
sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
{
int i;
register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
register __m256i A, B, C, D, E, F, G, H;
#if !defined(__AVX512VL__)
// Disable for AVX10_256
__m256i X_xor_Y, Y_xor_Z;
#endif
__m256i W[80];
mm256_block_bswap_64( W , in );
mm256_block_bswap_64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = _mm256_add_epi64( ssg512_add( W[i-15], W[i-2] ),
_mm256_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
W[i] = mm256_add4_64( SSG5_0( W[i-15] ), SSG5_1( W[i-2] ),
W[ i- 7 ], W[ i-16 ] );
if ( ctx->initialized )
{
@@ -351,17 +364,20 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
}
else
{
A = m256_const1_64( 0x6A09E667F3BCC908 );
B = m256_const1_64( 0xBB67AE8584CAA73B );
C = m256_const1_64( 0x3C6EF372FE94F82B );
D = m256_const1_64( 0xA54FF53A5F1D36F1 );
E = m256_const1_64( 0x510E527FADE682D1 );
F = m256_const1_64( 0x9B05688C2B3E6C1F );
G = m256_const1_64( 0x1F83D9ABFB41BD6B );
H = m256_const1_64( 0x5BE0CD19137E2179 );
A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
}
#if !defined(__AVX512VL__)
// Disable for AVX10_256
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
for ( i = 0; i < 80; i += 8 )
{
@@ -389,14 +405,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm256_add_epi64( A, m256_const1_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, m256_const1_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, m256_const1_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, m256_const1_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, m256_const1_64( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, m256_const1_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, m256_const1_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, m256_const1_64( 0x5BE0CD19137E2179 ) );
r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
}
}
@@ -441,7 +457,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
ptr += 8;
if ( ptr > pad )
{

View File

@@ -112,50 +112,50 @@ extern "C"{
else \
{ \
(state)->state_loaded = true; \
A0 = m256_const1_64( 0x20728DFD20728DFD ); \
A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m256_const1_64( 0xE782B699E782B699 ); \
A3 = m256_const1_64( 0x5530463255304632 ); \
A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m256_const1_64( 0x8BD144108BD14410 ); \
AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
B8 = m256_const1_64( 0x48910A5A48910A5A ); \
B9 = m256_const1_64( 0x893B22DB893B22DB ); \
BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
BC = m256_const1_64( 0x72D2F24072D2F240 ); \
BD = m256_const1_64( 0x75941D9975941D99 ); \
BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
C2 = m256_const1_64( 0x56028CB256028CB2 ); \
C3 = m256_const1_64( 0x8134F3598134F359 ); \
C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
C7 = m256_const1_64( 0x0405278004052780 ); \
C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
C9 = m256_const1_64( 0x5194358F5194358F ); \
CA = m256_const1_64( 0x3C60D6653C60D665 ); \
CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
CC = m256_const1_64( 0x950C3434950C3434 ); \
CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
A0 = _mm256_set1_epi64x( 0x20728DFD20728DFD ); \
A1 = _mm256_set1_epi64x( 0x46C0BD5346C0BD53 ); \
A2 = _mm256_set1_epi64x( 0xE782B699E782B699 ); \
A3 = _mm256_set1_epi64x( 0x5530463255304632 ); \
A4 = _mm256_set1_epi64x( 0x71B4EF9071B4EF90 ); \
A5 = _mm256_set1_epi64x( 0x0EA9E82C0EA9E82C ); \
A6 = _mm256_set1_epi64x( 0xDBB930F1DBB930F1 ); \
A7 = _mm256_set1_epi64x( 0xFAD06B8BFAD06B8B ); \
A8 = _mm256_set1_epi64x( 0xBE0CAE40BE0CAE40 ); \
A9 = _mm256_set1_epi64x( 0x8BD144108BD14410 ); \
AA = _mm256_set1_epi64x( 0x76D2ADAC76D2ADAC ); \
AB = _mm256_set1_epi64x( 0x28ACAB7F28ACAB7F ); \
B0 = _mm256_set1_epi64x( 0xC1099CB7C1099CB7 ); \
B1 = _mm256_set1_epi64x( 0x07B385F307B385F3 ); \
B2 = _mm256_set1_epi64x( 0xE7442C26E7442C26 ); \
B3 = _mm256_set1_epi64x( 0xCC8AD640CC8AD640 ); \
B4 = _mm256_set1_epi64x( 0xEB6F56C7EB6F56C7 ); \
B5 = _mm256_set1_epi64x( 0x1EA81AA91EA81AA9 ); \
B6 = _mm256_set1_epi64x( 0x73B9D31473B9D314 ); \
B7 = _mm256_set1_epi64x( 0x1DE85D081DE85D08 ); \
B8 = _mm256_set1_epi64x( 0x48910A5A48910A5A ); \
B9 = _mm256_set1_epi64x( 0x893B22DB893B22DB ); \
BA = _mm256_set1_epi64x( 0xC5A0DF44C5A0DF44 ); \
BB = _mm256_set1_epi64x( 0xBBC4324EBBC4324E ); \
BC = _mm256_set1_epi64x( 0x72D2F24072D2F240 ); \
BD = _mm256_set1_epi64x( 0x75941D9975941D99 ); \
BE = _mm256_set1_epi64x( 0x6D8BDE826D8BDE82 ); \
BF = _mm256_set1_epi64x( 0xA1A7502BA1A7502B ); \
C0 = _mm256_set1_epi64x( 0xD9BF68D1D9BF68D1 ); \
C1 = _mm256_set1_epi64x( 0x58BAD75058BAD750 ); \
C2 = _mm256_set1_epi64x( 0x56028CB256028CB2 ); \
C3 = _mm256_set1_epi64x( 0x8134F3598134F359 ); \
C4 = _mm256_set1_epi64x( 0xB5D469D8B5D469D8 ); \
C5 = _mm256_set1_epi64x( 0x941A8CC2941A8CC2 ); \
C6 = _mm256_set1_epi64x( 0x418B2A6E418B2A6E ); \
C7 = _mm256_set1_epi64x( 0x0405278004052780 ); \
C8 = _mm256_set1_epi64x( 0x7F07D7877F07D787 ); \
C9 = _mm256_set1_epi64x( 0x5194358F5194358F ); \
CA = _mm256_set1_epi64x( 0x3C60D6653C60D665 ); \
CB = _mm256_set1_epi64x( 0xBE97D79ABE97D79A ); \
CC = _mm256_set1_epi64x( 0x950C3434950C3434 ); \
CD = _mm256_set1_epi64x( 0xAED9A06DAED9A06D ); \
CE = _mm256_set1_epi64x( 0x2537DC8D2537DC8D ); \
CF = _mm256_set1_epi64x( 0x7CDB59697CDB5969 ); \
} \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
@@ -303,7 +303,7 @@ do { \
#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
_mm256_mullo_epi32( mm256_xor3( xa0, xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
@@ -443,52 +443,52 @@ shabal_8way_init( void *cc, unsigned size )
else
{ // No users
sc->state_loaded = true;
sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
sc->A[ 0] = _mm256_set1_epi64x( 0x52F8455252F84552 );
sc->A[ 1] = _mm256_set1_epi64x( 0xE54B7999E54B7999 );
sc->A[ 2] = _mm256_set1_epi64x( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = _mm256_set1_epi64x( 0xB9645191B9645191 );
sc->A[ 4] = _mm256_set1_epi64x( 0xE0078B86E0078B86 );
sc->A[ 5] = _mm256_set1_epi64x( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = _mm256_set1_epi64x( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = _mm256_set1_epi64x( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = _mm256_set1_epi64x( 0x14CE5A4514CE5A45 );
sc->A[ 9] = _mm256_set1_epi64x( 0x22AF50DC22AF50DC );
sc->A[10] = _mm256_set1_epi64x( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = _mm256_set1_epi64x( 0xEB21B74AEB21B74A );
sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
sc->B[ 0] = _mm256_set1_epi64x( 0xB555C6EEB555C6EE );
sc->B[ 1] = _mm256_set1_epi64x( 0x3E7105963E710596 );
sc->B[ 2] = _mm256_set1_epi64x( 0xA72A652FA72A652F );
sc->B[ 3] = _mm256_set1_epi64x( 0x9301515F9301515F );
sc->B[ 4] = _mm256_set1_epi64x( 0xDA28C1FADA28C1FA );
sc->B[ 5] = _mm256_set1_epi64x( 0x696FD868696FD868 );
sc->B[ 6] = _mm256_set1_epi64x( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = _mm256_set1_epi64x( 0x0AFE40020AFE4002 );
sc->B[ 8] = _mm256_set1_epi64x( 0xA6E03615A6E03615 );
sc->B[ 9] = _mm256_set1_epi64x( 0x5138C1D45138C1D4 );
sc->B[10] = _mm256_set1_epi64x( 0xBE216306BE216306 );
sc->B[11] = _mm256_set1_epi64x( 0xB38B8890B38B8890 );
sc->B[12] = _mm256_set1_epi64x( 0x3EA8B96B3EA8B96B );
sc->B[13] = _mm256_set1_epi64x( 0x3299ACE43299ACE4 );
sc->B[14] = _mm256_set1_epi64x( 0x30924DD430924DD4 );
sc->B[15] = _mm256_set1_epi64x( 0x55CB34A555CB34A5 );
sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
sc->C[ 0] = _mm256_set1_epi64x( 0xB405F031B405F031 );
sc->C[ 1] = _mm256_set1_epi64x( 0xC4233EBAC4233EBA );
sc->C[ 2] = _mm256_set1_epi64x( 0xB3733979B3733979 );
sc->C[ 3] = _mm256_set1_epi64x( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = _mm256_set1_epi64x( 0xC51C28AEC51C28AE );
sc->C[ 5] = _mm256_set1_epi64x( 0xA327B8E1A327B8E1 );
sc->C[ 6] = _mm256_set1_epi64x( 0x56C5616756C56167 );
sc->C[ 7] = _mm256_set1_epi64x( 0xED614433ED614433 );
sc->C[ 8] = _mm256_set1_epi64x( 0x88B59D6088B59D60 );
sc->C[ 9] = _mm256_set1_epi64x( 0x60E2CEBA60E2CEBA );
sc->C[10] = _mm256_set1_epi64x( 0x758B4B8B758B4B8B );
sc->C[11] = _mm256_set1_epi64x( 0x83E82A7F83E82A7F );
sc->C[12] = _mm256_set1_epi64x( 0xBC968828BC968828 );
sc->C[13] = _mm256_set1_epi64x( 0xE6E00BF7E6E00BF7 );
sc->C[14] = _mm256_set1_epi64x( 0xBA839E55BA839E55 );
sc->C[15] = _mm256_set1_epi64x( 0x9B491C609B491C60 );
}
sc->Wlow = 1;
sc->Whigh = 0;
@@ -707,50 +707,50 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
else \
{ \
(state)->state_loaded = true; \
A0 = m128_const1_64( 0x20728DFD20728DFD ); \
A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
A2 = m128_const1_64( 0xE782B699E782B699 ); \
A3 = m128_const1_64( 0x5530463255304632 ); \
A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
A9 = m128_const1_64( 0x8BD144108BD14410 ); \
AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m128_const1_64( 0x07B385F307B385F3 ); \
B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
B3 = m128_const1_64( 0xCC8AD640CC8AD640 ); \
B4 = m128_const1_64( 0xEB6F56C7EB6F56C7 ); \
B5 = m128_const1_64( 0x1EA81AA91EA81AA9 ); \
B6 = m128_const1_64( 0x73B9D31473B9D314 ); \
B7 = m128_const1_64( 0x1DE85D081DE85D08 ); \
B8 = m128_const1_64( 0x48910A5A48910A5A ); \
B9 = m128_const1_64( 0x893B22DB893B22DB ); \
BA = m128_const1_64( 0xC5A0DF44C5A0DF44 ); \
BB = m128_const1_64( 0xBBC4324EBBC4324E ); \
BC = m128_const1_64( 0x72D2F24072D2F240 ); \
BD = m128_const1_64( 0x75941D9975941D99 ); \
BE = m128_const1_64( 0x6D8BDE826D8BDE82 ); \
BF = m128_const1_64( 0xA1A7502BA1A7502B ); \
C0 = m128_const1_64( 0xD9BF68D1D9BF68D1 ); \
C1 = m128_const1_64( 0x58BAD75058BAD750 ); \
C2 = m128_const1_64( 0x56028CB256028CB2 ); \
C3 = m128_const1_64( 0x8134F3598134F359 ); \
C4 = m128_const1_64( 0xB5D469D8B5D469D8 ); \
C5 = m128_const1_64( 0x941A8CC2941A8CC2 ); \
C6 = m128_const1_64( 0x418B2A6E418B2A6E ); \
C7 = m128_const1_64( 0x0405278004052780 ); \
C8 = m128_const1_64( 0x7F07D7877F07D787 ); \
C9 = m128_const1_64( 0x5194358F5194358F ); \
CA = m128_const1_64( 0x3C60D6653C60D665 ); \
CB = m128_const1_64( 0xBE97D79ABE97D79A ); \
CC = m128_const1_64( 0x950C3434950C3434 ); \
CD = m128_const1_64( 0xAED9A06DAED9A06D ); \
CE = m128_const1_64( 0x2537DC8D2537DC8D ); \
CF = m128_const1_64( 0x7CDB59697CDB5969 ); \
A0 = _mm_set1_epi64x( 0x20728DFD20728DFD ); \
A1 = _mm_set1_epi64x( 0x46C0BD5346C0BD53 ); \
A2 = _mm_set1_epi64x( 0xE782B699E782B699 ); \
A3 = _mm_set1_epi64x( 0x5530463255304632 ); \
A4 = _mm_set1_epi64x( 0x71B4EF9071B4EF90 ); \
A5 = _mm_set1_epi64x( 0x0EA9E82C0EA9E82C ); \
A6 = _mm_set1_epi64x( 0xDBB930F1DBB930F1 ); \
A7 = _mm_set1_epi64x( 0xFAD06B8BFAD06B8B ); \
A8 = _mm_set1_epi64x( 0xBE0CAE40BE0CAE40 ); \
A9 = _mm_set1_epi64x( 0x8BD144108BD14410 ); \
AA = _mm_set1_epi64x( 0x76D2ADAC76D2ADAC ); \
AB = _mm_set1_epi64x( 0x28ACAB7F28ACAB7F ); \
B0 = _mm_set1_epi64x( 0xC1099CB7C1099CB7 ); \
B1 = _mm_set1_epi64x( 0x07B385F307B385F3 ); \
B2 = _mm_set1_epi64x( 0xE7442C26E7442C26 ); \
B3 = _mm_set1_epi64x( 0xCC8AD640CC8AD640 ); \
B4 = _mm_set1_epi64x( 0xEB6F56C7EB6F56C7 ); \
B5 = _mm_set1_epi64x( 0x1EA81AA91EA81AA9 ); \
B6 = _mm_set1_epi64x( 0x73B9D31473B9D314 ); \
B7 = _mm_set1_epi64x( 0x1DE85D081DE85D08 ); \
B8 = _mm_set1_epi64x( 0x48910A5A48910A5A ); \
B9 = _mm_set1_epi64x( 0x893B22DB893B22DB ); \
BA = _mm_set1_epi64x( 0xC5A0DF44C5A0DF44 ); \
BB = _mm_set1_epi64x( 0xBBC4324EBBC4324E ); \
BC = _mm_set1_epi64x( 0x72D2F24072D2F240 ); \
BD = _mm_set1_epi64x( 0x75941D9975941D99 ); \
BE = _mm_set1_epi64x( 0x6D8BDE826D8BDE82 ); \
BF = _mm_set1_epi64x( 0xA1A7502BA1A7502B ); \
C0 = _mm_set1_epi64x( 0xD9BF68D1D9BF68D1 ); \
C1 = _mm_set1_epi64x( 0x58BAD75058BAD750 ); \
C2 = _mm_set1_epi64x( 0x56028CB256028CB2 ); \
C3 = _mm_set1_epi64x( 0x8134F3598134F359 ); \
C4 = _mm_set1_epi64x( 0xB5D469D8B5D469D8 ); \
C5 = _mm_set1_epi64x( 0x941A8CC2941A8CC2 ); \
C6 = _mm_set1_epi64x( 0x418B2A6E418B2A6E ); \
C7 = _mm_set1_epi64x( 0x0405278004052780 ); \
C8 = _mm_set1_epi64x( 0x7F07D7877F07D787 ); \
C9 = _mm_set1_epi64x( 0x5194358F5194358F ); \
CA = _mm_set1_epi64x( 0x3C60D6653C60D665 ); \
CB = _mm_set1_epi64x( 0xBE97D79ABE97D79A ); \
CC = _mm_set1_epi64x( 0x950C3434950C3434 ); \
CD = _mm_set1_epi64x( 0xAED9A06DAED9A06D ); \
CE = _mm_set1_epi64x( 0x2537DC8D2537DC8D ); \
CF = _mm_set1_epi64x( 0x7CDB59697CDB5969 ); \
} \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
@@ -896,6 +896,16 @@ do { \
mm128_swap256_128( BF, CF ); \
} while (0)
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm128_xor3( xm, xb1, mm128_xorandnot( \
_mm_mullo_epi32( mm128_xor3( xa0, xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
xb0 = mm128_xnor( xa0, mm128_rol_32( xb0, 1 ) ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
@@ -905,6 +915,7 @@ do { \
) ), THREE ) ) ) ); \
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
} while (0)
*/
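The dense form above matches the sphlib scalar reference, assuming mm128_xorandnot( x, y, z ) expands to x ^ ( z & ~y ) and FIVE/THREE are the broadcast constants 5 and 3. A runnable scalar restatement for comparison:

#include <stdio.h>
#include <stdint.h>

#define ROTL32( x, n ) ( ( (x) << (n) ) | ( (x) >> ( 32 - (n) ) ) )
// Shabal's permutation element, scalar (sphlib-style) form.
#define PERM_ELT_REF( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
    xa0 = ( ( xa0 ^ ( ROTL32( xa1, 15 ) * 5U ) ^ xc ) * 3U ) \
        ^ xb1 ^ ( xb2 & ~xb3 ) ^ xm; \
    xb0 = ~( ROTL32( xb0, 1 ) ^ xa0 ); \
} while (0)

int main()
{
    uint32_t a0 = 1, a1 = 2, b0 = 3, b1 = 4, b2 = 5, b3 = 6, c = 7, m = 8;
    PERM_ELT_REF( a0, a1, b0, b1, b2, b3, c, m );
    printf( "%08x %08x\n", a0, b0 );
    return 0;
}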
#define PERM_STEP_0 do { \
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
@@ -1078,103 +1089,103 @@ shabal_4way_init( void *cc, unsigned size )
{ // copy immediate constants directly to working registers later.
sc->state_loaded = false;
/*
sc->A[ 0] = m128_const1_64( 0x20728DFD20728DFD );
sc->A[ 1] = m128_const1_64( 0x46C0BD5346C0BD53 );
sc->A[ 2] = m128_const1_64( 0xE782B699E782B699 );
sc->A[ 3] = m128_const1_64( 0x5530463255304632 );
sc->A[ 4] = m128_const1_64( 0x71B4EF9071B4EF90 );
sc->A[ 5] = m128_const1_64( 0x0EA9E82C0EA9E82C );
sc->A[ 6] = m128_const1_64( 0xDBB930F1DBB930F1 );
sc->A[ 7] = m128_const1_64( 0xFAD06B8BFAD06B8B );
sc->A[ 8] = m128_const1_64( 0xBE0CAE40BE0CAE40 );
sc->A[ 9] = m128_const1_64( 0x8BD144108BD14410 );
sc->A[10] = m128_const1_64( 0x76D2ADAC76D2ADAC );
sc->A[11] = m128_const1_64( 0x28ACAB7F28ACAB7F );
sc->A[ 0] = _mm_set1_epi64x( 0x20728DFD20728DFD );
sc->A[ 1] = _mm_set1_epi64x( 0x46C0BD5346C0BD53 );
sc->A[ 2] = _mm_set1_epi64x( 0xE782B699E782B699 );
sc->A[ 3] = _mm_set1_epi64x( 0x5530463255304632 );
sc->A[ 4] = _mm_set1_epi64x( 0x71B4EF9071B4EF90 );
sc->A[ 5] = _mm_set1_epi64x( 0x0EA9E82C0EA9E82C );
sc->A[ 6] = _mm_set1_epi64x( 0xDBB930F1DBB930F1 );
sc->A[ 7] = _mm_set1_epi64x( 0xFAD06B8BFAD06B8B );
sc->A[ 8] = _mm_set1_epi64x( 0xBE0CAE40BE0CAE40 );
sc->A[ 9] = _mm_set1_epi64x( 0x8BD144108BD14410 );
sc->A[10] = _mm_set1_epi64x( 0x76D2ADAC76D2ADAC );
sc->A[11] = _mm_set1_epi64x( 0x28ACAB7F28ACAB7F );
sc->B[ 0] = m128_const1_64( 0xC1099CB7C1099CB7 );
sc->B[ 1] = m128_const1_64( 0x07B385F307B385F3 );
sc->B[ 2] = m128_const1_64( 0xE7442C26E7442C26 );
sc->B[ 3] = m128_const1_64( 0xCC8AD640CC8AD640 );
sc->B[ 4] = m128_const1_64( 0xEB6F56C7EB6F56C7 );
sc->B[ 5] = m128_const1_64( 0x1EA81AA91EA81AA9 );
sc->B[ 6] = m128_const1_64( 0x73B9D31473B9D314 );
sc->B[ 7] = m128_const1_64( 0x1DE85D081DE85D08 );
sc->B[ 8] = m128_const1_64( 0x48910A5A48910A5A );
sc->B[ 9] = m128_const1_64( 0x893B22DB893B22DB );
sc->B[10] = m128_const1_64( 0xC5A0DF44C5A0DF44 );
sc->B[11] = m128_const1_64( 0xBBC4324EBBC4324E );
sc->B[12] = m128_const1_64( 0x72D2F24072D2F240 );
sc->B[13] = m128_const1_64( 0x75941D9975941D99 );
sc->B[14] = m128_const1_64( 0x6D8BDE826D8BDE82 );
sc->B[15] = m128_const1_64( 0xA1A7502BA1A7502B );
sc->B[ 0] = _mm_set1_epi64x( 0xC1099CB7C1099CB7 );
sc->B[ 1] = _mm_set1_epi64x( 0x07B385F307B385F3 );
sc->B[ 2] = _mm_set1_epi64x( 0xE7442C26E7442C26 );
sc->B[ 3] = _mm_set1_epi64x( 0xCC8AD640CC8AD640 );
sc->B[ 4] = _mm_set1_epi64x( 0xEB6F56C7EB6F56C7 );
sc->B[ 5] = _mm_set1_epi64x( 0x1EA81AA91EA81AA9 );
sc->B[ 6] = _mm_set1_epi64x( 0x73B9D31473B9D314 );
sc->B[ 7] = _mm_set1_epi64x( 0x1DE85D081DE85D08 );
sc->B[ 8] = _mm_set1_epi64x( 0x48910A5A48910A5A );
sc->B[ 9] = _mm_set1_epi64x( 0x893B22DB893B22DB );
sc->B[10] = _mm_set1_epi64x( 0xC5A0DF44C5A0DF44 );
sc->B[11] = _mm_set1_epi64x( 0xBBC4324EBBC4324E );
sc->B[12] = _mm_set1_epi64x( 0x72D2F24072D2F240 );
sc->B[13] = _mm_set1_epi64x( 0x75941D9975941D99 );
sc->B[14] = _mm_set1_epi64x( 0x6D8BDE826D8BDE82 );
sc->B[15] = _mm_set1_epi64x( 0xA1A7502BA1A7502B );
sc->C[ 0] = m128_const1_64( 0xD9BF68D1D9BF68D1 );
sc->C[ 1] = m128_const1_64( 0x58BAD75058BAD750 );
sc->C[ 2] = m128_const1_64( 0x56028CB256028CB2 );
sc->C[ 3] = m128_const1_64( 0x8134F3598134F359 );
sc->C[ 4] = m128_const1_64( 0xB5D469D8B5D469D8 );
sc->C[ 5] = m128_const1_64( 0x941A8CC2941A8CC2 );
sc->C[ 6] = m128_const1_64( 0x418B2A6E418B2A6E );
sc->C[ 7] = m128_const1_64( 0x0405278004052780 );
sc->C[ 8] = m128_const1_64( 0x7F07D7877F07D787 );
sc->C[ 9] = m128_const1_64( 0x5194358F5194358F );
sc->C[10] = m128_const1_64( 0x3C60D6653C60D665 );
sc->C[11] = m128_const1_64( 0xBE97D79ABE97D79A );
sc->C[12] = m128_const1_64( 0x950C3434950C3434 );
sc->C[13] = m128_const1_64( 0xAED9A06DAED9A06D );
sc->C[14] = m128_const1_64( 0x2537DC8D2537DC8D );
sc->C[15] = m128_const1_64( 0x7CDB59697CDB5969 );
sc->C[ 0] = _mm_set1_epi64x( 0xD9BF68D1D9BF68D1 );
sc->C[ 1] = _mm_set1_epi64x( 0x58BAD75058BAD750 );
sc->C[ 2] = _mm_set1_epi64x( 0x56028CB256028CB2 );
sc->C[ 3] = _mm_set1_epi64x( 0x8134F3598134F359 );
sc->C[ 4] = _mm_set1_epi64x( 0xB5D469D8B5D469D8 );
sc->C[ 5] = _mm_set1_epi64x( 0x941A8CC2941A8CC2 );
sc->C[ 6] = _mm_set1_epi64x( 0x418B2A6E418B2A6E );
sc->C[ 7] = _mm_set1_epi64x( 0x0405278004052780 );
sc->C[ 8] = _mm_set1_epi64x( 0x7F07D7877F07D787 );
sc->C[ 9] = _mm_set1_epi64x( 0x5194358F5194358F );
sc->C[10] = _mm_set1_epi64x( 0x3C60D6653C60D665 );
sc->C[11] = _mm_set1_epi64x( 0xBE97D79ABE97D79A );
sc->C[12] = _mm_set1_epi64x( 0x950C3434950C3434 );
sc->C[13] = _mm_set1_epi64x( 0xAED9A06DAED9A06D );
sc->C[14] = _mm_set1_epi64x( 0x2537DC8D2537DC8D );
sc->C[15] = _mm_set1_epi64x( 0x7CDB59697CDB5969 );
*/
}
else
{ // No users
sc->state_loaded = true;
sc->A[ 0] = m128_const1_64( 0x52F8455252F84552 );
sc->A[ 1] = m128_const1_64( 0xE54B7999E54B7999 );
sc->A[ 2] = m128_const1_64( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = m128_const1_64( 0xB9645191B9645191 );
sc->A[ 4] = m128_const1_64( 0xE0078B86E0078B86 );
sc->A[ 5] = m128_const1_64( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = m128_const1_64( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = m128_const1_64( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = m128_const1_64( 0x14CE5A4514CE5A45 );
sc->A[ 9] = m128_const1_64( 0x22AF50DC22AF50DC );
sc->A[10] = m128_const1_64( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = m128_const1_64( 0xEB21B74AEB21B74A );
sc->A[ 0] = _mm_set1_epi64x( 0x52F8455252F84552 );
sc->A[ 1] = _mm_set1_epi64x( 0xE54B7999E54B7999 );
sc->A[ 2] = _mm_set1_epi64x( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = _mm_set1_epi64x( 0xB9645191B9645191 );
sc->A[ 4] = _mm_set1_epi64x( 0xE0078B86E0078B86 );
sc->A[ 5] = _mm_set1_epi64x( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = _mm_set1_epi64x( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = _mm_set1_epi64x( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = _mm_set1_epi64x( 0x14CE5A4514CE5A45 );
sc->A[ 9] = _mm_set1_epi64x( 0x22AF50DC22AF50DC );
sc->A[10] = _mm_set1_epi64x( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = _mm_set1_epi64x( 0xEB21B74AEB21B74A );
sc->B[ 0] = m128_const1_64( 0xB555C6EEB555C6EE );
sc->B[ 1] = m128_const1_64( 0x3E7105963E710596 );
sc->B[ 2] = m128_const1_64( 0xA72A652FA72A652F );
sc->B[ 3] = m128_const1_64( 0x9301515F9301515F );
sc->B[ 4] = m128_const1_64( 0xDA28C1FADA28C1FA );
sc->B[ 5] = m128_const1_64( 0x696FD868696FD868 );
sc->B[ 6] = m128_const1_64( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = m128_const1_64( 0x0AFE40020AFE4002 );
sc->B[ 8] = m128_const1_64( 0xA6E03615A6E03615 );
sc->B[ 9] = m128_const1_64( 0x5138C1D45138C1D4 );
sc->B[10] = m128_const1_64( 0xBE216306BE216306 );
sc->B[11] = m128_const1_64( 0xB38B8890B38B8890 );
sc->B[12] = m128_const1_64( 0x3EA8B96B3EA8B96B );
sc->B[13] = m128_const1_64( 0x3299ACE43299ACE4 );
sc->B[14] = m128_const1_64( 0x30924DD430924DD4 );
sc->B[15] = m128_const1_64( 0x55CB34A555CB34A5 );
sc->B[ 0] = _mm_set1_epi64x( 0xB555C6EEB555C6EE );
sc->B[ 1] = _mm_set1_epi64x( 0x3E7105963E710596 );
sc->B[ 2] = _mm_set1_epi64x( 0xA72A652FA72A652F );
sc->B[ 3] = _mm_set1_epi64x( 0x9301515F9301515F );
sc->B[ 4] = _mm_set1_epi64x( 0xDA28C1FADA28C1FA );
sc->B[ 5] = _mm_set1_epi64x( 0x696FD868696FD868 );
sc->B[ 6] = _mm_set1_epi64x( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = _mm_set1_epi64x( 0x0AFE40020AFE4002 );
sc->B[ 8] = _mm_set1_epi64x( 0xA6E03615A6E03615 );
sc->B[ 9] = _mm_set1_epi64x( 0x5138C1D45138C1D4 );
sc->B[10] = _mm_set1_epi64x( 0xBE216306BE216306 );
sc->B[11] = _mm_set1_epi64x( 0xB38B8890B38B8890 );
sc->B[12] = _mm_set1_epi64x( 0x3EA8B96B3EA8B96B );
sc->B[13] = _mm_set1_epi64x( 0x3299ACE43299ACE4 );
sc->B[14] = _mm_set1_epi64x( 0x30924DD430924DD4 );
sc->B[15] = _mm_set1_epi64x( 0x55CB34A555CB34A5 );
sc->C[ 0] = m128_const1_64( 0xB405F031B405F031 );
sc->C[ 1] = m128_const1_64( 0xC4233EBAC4233EBA );
sc->C[ 2] = m128_const1_64( 0xB3733979B3733979 );
sc->C[ 3] = m128_const1_64( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = m128_const1_64( 0xC51C28AEC51C28AE );
sc->C[ 5] = m128_const1_64( 0xA327B8E1A327B8E1 );
sc->C[ 6] = m128_const1_64( 0x56C5616756C56167 );
sc->C[ 7] = m128_const1_64( 0xED614433ED614433 );
sc->C[ 8] = m128_const1_64( 0x88B59D6088B59D60 );
sc->C[ 9] = m128_const1_64( 0x60E2CEBA60E2CEBA );
sc->C[10] = m128_const1_64( 0x758B4B8B758B4B8B );
sc->C[11] = m128_const1_64( 0x83E82A7F83E82A7F );
sc->C[12] = m128_const1_64( 0xBC968828BC968828 );
sc->C[13] = m128_const1_64( 0xE6E00BF7E6E00BF7 );
sc->C[14] = m128_const1_64( 0xBA839E55BA839E55 );
sc->C[15] = m128_const1_64( 0x9B491C609B491C60 );
sc->C[ 0] = _mm_set1_epi64x( 0xB405F031B405F031 );
sc->C[ 1] = _mm_set1_epi64x( 0xC4233EBAC4233EBA );
sc->C[ 2] = _mm_set1_epi64x( 0xB3733979B3733979 );
sc->C[ 3] = _mm_set1_epi64x( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = _mm_set1_epi64x( 0xC51C28AEC51C28AE );
sc->C[ 5] = _mm_set1_epi64x( 0xA327B8E1A327B8E1 );
sc->C[ 6] = _mm_set1_epi64x( 0x56C5616756C56167 );
sc->C[ 7] = _mm_set1_epi64x( 0xED614433ED614433 );
sc->C[ 8] = _mm_set1_epi64x( 0x88B59D6088B59D60 );
sc->C[ 9] = _mm_set1_epi64x( 0x60E2CEBA60E2CEBA );
sc->C[10] = _mm_set1_epi64x( 0x758B4B8B758B4B8B );
sc->C[11] = _mm_set1_epi64x( 0x83E82A7F83E82A7F );
sc->C[12] = _mm_set1_epi64x( 0xBC968828BC968828 );
sc->C[13] = _mm_set1_epi64x( 0xE6E00BF7E6E00BF7 );
sc->C[14] = _mm_set1_epi64x( 0xBA839E55BA839E55 );
sc->C[15] = _mm_set1_epi64x( 0x9B491C609B491C60 );
}
sc->Wlow = 1;
sc->Whigh = 0;

View File

@@ -32,6 +32,44 @@ static const uint32_t IV512[] =
#endif
#if defined (__AVX512VL__)
//TODO Enable for AVX10_256
#define DECL_m256i_count \
const __m256i count = \
mm256_set4_32( ctx->count3, ctx->count2, ctx->count1, ctx->count0 );
#define COUNT_R0 \
_mm256_mask_xor_epi32( count, 0x88, count, m256_neg1 )
#define COUNT_R1 \
mm256_shuflr128_32( _mm256_mask_xor_epi32( count, 0x11, count, m256_neg1 ) )
#define COUNT_R2 \
mm256_swap128_64( _mm256_mask_xor_epi32( count, 0x22, count, m256_neg1 ) )
#define COUNT_R13 \
mm256_swap64_32( _mm256_mask_xor_epi32( count, 0x44, count, m256_neg1 ) )
#else
#define DECL_m256i_count
// R matches the loop index, not the round number; should change that.
#define COUNT_R0 \
mm256_set4_32( ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 )
#define COUNT_R1 \
mm256_set4_32( ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 )
#define COUNT_R2 \
mm256_set4_32( ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 )
#define COUNT_R13 \
mm256_set4_32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 )
#endif
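// Illustrative scalar sketch (not part of this commit): the AVX512VL form
// builds the same vector as the set4 fallback because XOR with an all-ones
// source complements exactly the 32-bit lanes selected by the mask, e.g.
// 0x88 selects lanes 3 and 7, yielding ~count3 in each 128-bit half.

#include <stdint.h>
#include <stdio.h>

int main()
{
   uint32_t count[8] = { 0x10, 0x20, 0x30, 0x40, 0x10, 0x20, 0x30, 0x40 };
   uint32_t r0[8];
   for ( int i = 0; i < 8; i++ )       // per-lane effect of _mm256_mask_xor_epi32
      r0[i] = ( ( 0x88 >> i ) & 1 ) ? ~count[i] : count[i];
   // r0 is { ~count3, count2, count1, count0 } per 128-bit half, identical
   // to the COUNT_R0 the non-VL path builds with mm256_set4_32.
   printf( "%08x %08x %08x %08x\n", r0[3], r0[2], r0[1], r0[0] );
   return 0;
}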
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
@@ -40,6 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
__m256i *h = (__m256i*)ctx->h;
DECL_m256i_count;
int r;
p0 = h[0];
@@ -47,7 +86,8 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
p2 = h[2];
p3 = h[3];
// round
// round 0
k00 = m[0];
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
k01 = m[1];
@@ -78,18 +118,14 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
mm256_aesenc_2x128( k00, zero ) ) );
if ( r == 0 )
k00 = _mm256_xor_si256( k00, _mm256_set_epi32(
~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
k00 = _mm256_xor_si256( k00, COUNT_R0 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( k00,
mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
k01 = _mm256_xor_si256( k01, COUNT_R1 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k01,
@@ -114,9 +150,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
k13 = _mm256_xor_si256( k13, COUNT_R2 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
p1 = _mm256_xor_si256( p1, x );
@@ -228,9 +262,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, COUNT_R13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_shuflr128_32(

View File

@@ -204,11 +204,9 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, mm512_swap64_32(
_mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );

View File

@@ -212,14 +212,24 @@ do { \
// targeted
#define shufxor2w(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, m256_const1_64( \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
_mm256_srai_epi16( x, 8 ) )
#else
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
#endif
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
m256_const1_64( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, m256_const1_64( 0x0080008000800080 ) ) ) )
_mm256_set1_epi64x( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
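// Scalar sketch of the reduction these macros vectorize (my illustration,
// assuming the usual SIMD-hash arithmetic mod 257): since 256 = -1 (mod 257),
// x = 256*hi + lo = lo - hi, and EXTRA_REDUCE_S folds results above 128 down
// by 257 so every 16-bit lane ends in the signed range [-128, 128].

#include <assert.h>
#include <stdint.h>

static int16_t reduce( int16_t x )
{ return (int16_t)( ( x & 0xff ) - ( x >> 8 ) ); }   // lo - hi

static int16_t extra_reduce_s( int16_t x )
{ return x > 128 ? x - 257 : x; }

int main()
{
   for ( int32_t x = -32768; x < 32768; x++ )
   {
      int32_t r = extra_reduce_s( reduce( (int16_t)x ) );
      assert( ( r - x ) % 257 == 0 );     // congruent mod 257
      assert( r >= -128 && r <= 128 );    // fully reduced
   }
   return 0;
}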
@@ -387,17 +397,11 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
_mm512_srai_epi16( x, 8 ) )
/*
#define REDUCE4w(x) \
_mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
*/
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
m512_const1_64( 0x0101010101010101 ), \
_mm512_set1_epi64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, m512_const1_64( 0x0080008000800080 ) ) ) ) )
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
// generic, except it calls targeted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

View File

@@ -63,7 +63,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
@@ -151,7 +151,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
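// Scalar sketch (illustration only): in the interleaved LE64 lanes the nonce
// sits in the high 32 bits of each 64-bit element, so a packed 32-bit add of
// the constant 0x0000000800000000 advances every lane's nonce by 8 while
// leaving the low halves untouched.

#include <stdint.h>
#include <stdio.h>

int main()
{
   uint32_t lane[2] = { 0x11111111, 100 };   // low dword, then nonce (LE layout)
   lane[1] += 8;            // per-lane effect of _mm512_add_epi32 with the constant
   printf( "nonce=%u low=%08x\n", lane[1], lane[0] );   // nonce=108
   return 0;
}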

View File

@@ -285,7 +285,7 @@ static const uint64_t IV512[] = {
#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
#define READ_STATE_BIG(sc) do { \
#define READ_STATE_BIG(sc) \
h0 = (sc)->h0; \
h1 = (sc)->h1; \
h2 = (sc)->h2; \
@@ -294,10 +294,9 @@ static const uint64_t IV512[] = {
h5 = (sc)->h5; \
h6 = (sc)->h6; \
h7 = (sc)->h7; \
bcount = sc->bcount; \
} while (0)
bcount = sc->bcount;
#define WRITE_STATE_BIG(sc) do { \
#define WRITE_STATE_BIG(sc) \
(sc)->h0 = h0; \
(sc)->h1 = h1; \
(sc)->h2 = h2; \
@@ -306,62 +305,54 @@ static const uint64_t IV512[] = {
(sc)->h5 = h5; \
(sc)->h6 = h6; \
(sc)->h7 = h7; \
sc->bcount = bcount; \
} while (0)
sc->bcount = bcount;
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \
k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \
mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\
t2 = t0 ^ t1; \
} while (0)
k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), \
mm512_xor3( k3, k4, k5 ), \
mm512_xor3( k6, k7, \
_mm512_set1_epi64( 0x1BD11BDAA9FC1A22) ) ); \
t2 = t0 ^ t1;
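// Scalar sketch (assumption: standard Skein/Threefish key schedule): the
// ninth key word is the XOR of the eight key words with the constant
// C240 = 0x1BD11BDAA9FC1A22, and t2 is the parity of the two tweak words.

#include <stdint.h>

static uint64_t threefish_key_parity( const uint64_t k[8] )
{
   uint64_t k8 = 0x1BD11BDAA9FC1A22ULL;   // C240 from the Skein spec
   for ( int i = 0; i < 8; i++ )
      k8 ^= k[i];
   return k8;
}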
#define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
do { \
w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
w1 = _mm512_add_epi64( w1, SKBI(k,s,1) ); \
w2 = _mm512_add_epi64( w2, SKBI(k,s,2) ); \
w3 = _mm512_add_epi64( w3, SKBI(k,s,3) ); \
w4 = _mm512_add_epi64( w4, SKBI(k,s,4) ); \
w5 = _mm512_add_epi64( w5, _mm512_add_epi64( SKBI(k,s,5), \
m512_const1_64( SKBT(t,s,0) ) ) ); \
_mm512_set1_epi64( SKBT(t,s,0) ) ) ); \
w6 = _mm512_add_epi64( w6, _mm512_add_epi64( SKBI(k,s,6), \
m512_const1_64( SKBT(t,s,1) ) ) ); \
_mm512_set1_epi64( SKBT(t,s,1) ) ) ); \
w7 = _mm512_add_epi64( w7, _mm512_add_epi64( SKBI(k,s,7), \
m512_const1_64( s ) ) ); \
} while (0)
_mm512_set1_epi64( s ) ) );
#define TFBIG_MIX_8WAY(x0, x1, rc) \
do { \
x0 = _mm512_add_epi64( x0, x1 ); \
x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 ); \
} while (0)
x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 );
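// For reference, a scalar sketch of the Threefish MIX these macros apply to
// all lanes at once (illustration, per the Skein spec): add, rotate, xor.

#include <stdint.h>

static inline uint64_t rol64( uint64_t x, int r )
{ return ( x << r ) | ( x >> ( 64 - r ) ); }

static inline void threefish_mix( uint64_t *x0, uint64_t *x1, int rc )
{
   *x0 += *x1;                      // x0 = x0 + x1
   *x1 = rol64( *x1, rc ) ^ *x0;    // x1 = rotl( x1, rc ) ^ x0
}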
#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) \
TFBIG_MIX_8WAY(w0, w1, rc0); \
TFBIG_MIX_8WAY(w2, w3, rc1); \
TFBIG_MIX_8WAY(w4, w5, rc2); \
TFBIG_MIX_8WAY(w6, w7, rc3); \
} while (0)
TFBIG_MIX_8WAY(w6, w7, rc3);
#define TFBIG_8WAY_4e(s) do { \
#define TFBIG_8WAY_4e(s) \
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
} while (0)
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
#define TFBIG_8WAY_4o(s) do { \
#define TFBIG_8WAY_4o(s) \
TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
} while (0)
TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
#define UBI_BIG_8WAY(etype, extra) \
do { \
@@ -424,59 +415,48 @@ do { \
#endif // AVX512
#define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \
k8 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( _mm256_xor_si256( k0, k1 ), \
_mm256_xor_si256( k2, k3 ) ), \
_mm256_xor_si256( _mm256_xor_si256( k4, k5 ), \
_mm256_xor_si256( k6, k7 ) ) ), \
m256_const1_64( 0x1BD11BDAA9FC1A22) ); \
t2 = t0 ^ t1; \
} while (0)
k8 = mm256_xor3( mm256_xor3( k0, k1, k2 ), \
mm256_xor3( k3, k4, k5 ), \
mm256_xor3( k6, k7, \
_mm256_set1_epi64x( 0x1BD11BDAA9FC1A22) ) ); \
t2 = t0 ^ t1;
#define TFBIG_ADDKEY_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
do { \
w0 = _mm256_add_epi64( w0, SKBI(k,s,0) ); \
w1 = _mm256_add_epi64( w1, SKBI(k,s,1) ); \
w2 = _mm256_add_epi64( w2, SKBI(k,s,2) ); \
w3 = _mm256_add_epi64( w3, SKBI(k,s,3) ); \
w4 = _mm256_add_epi64( w4, SKBI(k,s,4) ); \
w5 = _mm256_add_epi64( w5, _mm256_add_epi64( SKBI(k,s,5), \
m256_const1_64( SKBT(t,s,0) ) ) ); \
_mm256_set1_epi64x( SKBT(t,s,0) ) ) ); \
w6 = _mm256_add_epi64( w6, _mm256_add_epi64( SKBI(k,s,6), \
m256_const1_64( SKBT(t,s,1) ) ) ); \
_mm256_set1_epi64x( SKBT(t,s,1) ) ) ); \
w7 = _mm256_add_epi64( w7, _mm256_add_epi64( SKBI(k,s,7), \
m256_const1_64( s ) ) ); \
} while (0)
_mm256_set1_epi64x( s ) ) );
#define TFBIG_MIX_4WAY(x0, x1, rc) \
do { \
x0 = _mm256_add_epi64( x0, x1 ); \
x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \
} while (0)
x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 );
#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) \
TFBIG_MIX_4WAY(w0, w1, rc0); \
TFBIG_MIX_4WAY(w2, w3, rc1); \
TFBIG_MIX_4WAY(w4, w5, rc2); \
TFBIG_MIX_4WAY(w6, w7, rc3); \
} while (0)
TFBIG_MIX_4WAY(w6, w7, rc3);
#define TFBIG_4WAY_4e(s) do { \
#define TFBIG_4WAY_4e(s) \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
} while (0)
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
#define TFBIG_4WAY_4o(s) do { \
#define TFBIG_4WAY_4o(s) \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
} while (0)
TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
// scale buf offset by 4
#define UBI_BIG_4WAY(etype, extra) \
@@ -541,28 +521,28 @@ do { \
void skein256_8way_init( skein256_8way_context *sc )
{
sc->h0 = m512_const1_64( 0xCCD044A12FDB3E13 );
sc->h1 = m512_const1_64( 0xE83590301A79A9EB );
sc->h2 = m512_const1_64( 0x55AEA0614F816E6F );
sc->h3 = m512_const1_64( 0x2A2767A4AE9B94DB );
sc->h4 = m512_const1_64( 0xEC06025E74DD7683 );
sc->h5 = m512_const1_64( 0xE7A436CDC4746251 );
sc->h6 = m512_const1_64( 0xC36FBAF9393AD185 );
sc->h7 = m512_const1_64( 0x3EEDBA1833EDFC13 );
sc->h0 = _mm512_set1_epi64( 0xCCD044A12FDB3E13 );
sc->h1 = _mm512_set1_epi64( 0xE83590301A79A9EB );
sc->h2 = _mm512_set1_epi64( 0x55AEA0614F816E6F );
sc->h3 = _mm512_set1_epi64( 0x2A2767A4AE9B94DB );
sc->h4 = _mm512_set1_epi64( 0xEC06025E74DD7683 );
sc->h5 = _mm512_set1_epi64( 0xE7A436CDC4746251 );
sc->h6 = _mm512_set1_epi64( 0xC36FBAF9393AD185 );
sc->h7 = _mm512_set1_epi64( 0x3EEDBA1833EDFC13 );
sc->bcount = 0;
sc->ptr = 0;
}
void skein512_8way_init( skein512_8way_context *sc )
{
sc->h0 = m512_const1_64( 0x4903ADFF749C51CE );
sc->h1 = m512_const1_64( 0x0D95DE399746DF03 );
sc->h2 = m512_const1_64( 0x8FD1934127C79BCE );
sc->h3 = m512_const1_64( 0x9A255629FF352CB1 );
sc->h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
sc->h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
sc->h6 = m512_const1_64( 0x991112C71A75B523 );
sc->h7 = m512_const1_64( 0xAE18A40B660FCC33 );
sc->h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
sc->h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
sc->h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
sc->h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
sc->h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
sc->h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
sc->h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
sc->h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );
sc->bcount = 0;
sc->ptr = 0;
}
@@ -660,14 +640,14 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,
// Init
h0 = m512_const1_64( 0x4903ADFF749C51CE );
h1 = m512_const1_64( 0x0D95DE399746DF03 );
h2 = m512_const1_64( 0x8FD1934127C79BCE );
h3 = m512_const1_64( 0x9A255629FF352CB1 );
h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
h6 = m512_const1_64( 0x991112C71A75B523 );
h7 = m512_const1_64( 0xAE18A40B660FCC33 );
h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );
// Update
@@ -734,14 +714,14 @@ skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
buf[5] = vdata[5];
buf[6] = vdata[6];
buf[7] = vdata[7];
register __m512i h0 = m512_const1_64( 0x4903ADFF749C51CE );
register __m512i h1 = m512_const1_64( 0x0D95DE399746DF03 );
register __m512i h2 = m512_const1_64( 0x8FD1934127C79BCE );
register __m512i h3 = m512_const1_64( 0x9A255629FF352CB1 );
register __m512i h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
register __m512i h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
register __m512i h6 = m512_const1_64( 0x991112C71A75B523 );
register __m512i h7 = m512_const1_64( 0xAE18A40B660FCC33 );
register __m512i h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
register __m512i h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
register __m512i h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
register __m512i h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
register __m512i h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
register __m512i h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
register __m512i h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
register __m512i h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );
uint64_t bcount = 1;
UBI_BIG_8WAY( 224, 0 );
@@ -830,28 +810,28 @@ skein512_8way_close(void *cc, void *dst)
void skein256_4way_init( skein256_4way_context *sc )
{
sc->h0 = m256_const1_64( 0xCCD044A12FDB3E13 );
sc->h1 = m256_const1_64( 0xE83590301A79A9EB );
sc->h2 = m256_const1_64( 0x55AEA0614F816E6F );
sc->h3 = m256_const1_64( 0x2A2767A4AE9B94DB );
sc->h4 = m256_const1_64( 0xEC06025E74DD7683 );
sc->h5 = m256_const1_64( 0xE7A436CDC4746251 );
sc->h6 = m256_const1_64( 0xC36FBAF9393AD185 );
sc->h7 = m256_const1_64( 0x3EEDBA1833EDFC13 );
sc->h0 = _mm256_set1_epi64x( 0xCCD044A12FDB3E13 );
sc->h1 = _mm256_set1_epi64x( 0xE83590301A79A9EB );
sc->h2 = _mm256_set1_epi64x( 0x55AEA0614F816E6F );
sc->h3 = _mm256_set1_epi64x( 0x2A2767A4AE9B94DB );
sc->h4 = _mm256_set1_epi64x( 0xEC06025E74DD7683 );
sc->h5 = _mm256_set1_epi64x( 0xE7A436CDC4746251 );
sc->h6 = _mm256_set1_epi64x( 0xC36FBAF9393AD185 );
sc->h7 = _mm256_set1_epi64x( 0x3EEDBA1833EDFC13 );
sc->bcount = 0;
sc->ptr = 0;
}
void skein512_4way_init( skein512_4way_context *sc )
{
sc->h0 = m256_const1_64( 0x4903ADFF749C51CE );
sc->h1 = m256_const1_64( 0x0D95DE399746DF03 );
sc->h2 = m256_const1_64( 0x8FD1934127C79BCE );
sc->h3 = m256_const1_64( 0x9A255629FF352CB1 );
sc->h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
sc->h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
sc->h6 = m256_const1_64( 0x991112C71A75B523 );
sc->h7 = m256_const1_64( 0xAE18A40B660FCC33 );
sc->h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
sc->h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
sc->h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
sc->h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
sc->h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
sc->h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
sc->h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
sc->h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );
sc->bcount = 0;
sc->ptr = 0;
}
@@ -954,14 +934,14 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
const int buf_size = 64; // 64 * __m256i
uint64_t bcount = 0;
h0 = m256_const1_64( 0x4903ADFF749C51CE );
h1 = m256_const1_64( 0x0D95DE399746DF03 );
h2 = m256_const1_64( 0x8FD1934127C79BCE );
h3 = m256_const1_64( 0x9A255629FF352CB1 );
h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
h6 = m256_const1_64( 0x991112C71A75B523 );
h7 = m256_const1_64( 0xAE18A40B660FCC33 );
h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );
// Update
@@ -1028,14 +1008,14 @@ skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
buf[5] = vdata[5];
buf[6] = vdata[6];
buf[7] = vdata[7];
register __m256i h0 = m256_const1_64( 0x4903ADFF749C51CE );
register __m256i h1 = m256_const1_64( 0x0D95DE399746DF03 );
register __m256i h2 = m256_const1_64( 0x8FD1934127C79BCE );
register __m256i h3 = m256_const1_64( 0x9A255629FF352CB1 );
register __m256i h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
register __m256i h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
register __m256i h6 = m256_const1_64( 0x991112C71A75B523 );
register __m256i h7 = m256_const1_64( 0xAE18A40B660FCC33 );
register __m256i h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
register __m256i h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
register __m256i h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
register __m256i h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
register __m256i h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
register __m256i h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
register __m256i h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
register __m256i h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );
uint64_t bcount = 1;
UBI_BIG_4WAY( 224, 0 );

View File

@@ -57,7 +57,7 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
@@ -119,7 +119,7 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );

View File

@@ -630,36 +630,35 @@ void InitializeSWIFFTX()
}
// In the original code the F matrix is rotated so it was not arranged
// the same as all the other data. Rearranging F to match all the other
// data made vectorizing possible; the compiler probably could have
// auto-vectorized with proper data organisation.
// Also in the original code the custom 16 bit data types are all now 32
// bit int32_t regardless of the type name.
//
void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
// the same as the other data. Rearranging F made vectorizing up to 256 bits
// possible.
// Also in the original code the custom 16 bit data types are all now aliased
// to 32 bit int32_t.
void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
{
#if defined(__AVX2__)
__m256i F[8] __attribute__ ((aligned (64)));
__m256i F0, F1, F2, F3, F4, F5, F6, F7;
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
__m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output;
__m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );
F[0] = _mm256_mullo_epi32( mul[0], *tbl );
tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
F[1] = _mm256_mullo_epi32( mul[1], *tbl );
tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
F[2] = _mm256_mullo_epi32( mul[2], *tbl );
tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
F[3] = _mm256_mullo_epi32( mul[3], *tbl );
tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
F[4] = _mm256_mullo_epi32( mul[4], *tbl );
tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
F[5] = _mm256_mullo_epi32( mul[5], *tbl );
tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
F[6] = _mm256_mullo_epi32( mul[6], *tbl );
tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
F[7] = _mm256_mullo_epi32( mul[7], *tbl );
F0 = _mm256_mullo_epi32( mul[0], tbl );
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
F1 = _mm256_mullo_epi32( mul[1], tbl );
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
F2 = _mm256_mullo_epi32( mul[2], tbl );
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
F3 = _mm256_mullo_epi32( mul[3], tbl );
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
F4 = _mm256_mullo_epi32( mul[4], tbl );
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
F5 = _mm256_mullo_epi32( mul[5], tbl );
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
F6 = _mm256_mullo_epi32( mul[6], tbl );
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
F7 = _mm256_mullo_epi32( mul[7], tbl );
#define ADD_SUB( a, b ) \
{ \
@@ -668,52 +667,50 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
a = _mm256_add_epi32( a, tmp ); \
}
ADD_SUB( F[0], F[1] );
ADD_SUB( F[2], F[3] );
ADD_SUB( F[4], F[5] );
ADD_SUB( F[6], F[7] );
F[3] = _mm256_slli_epi32( F[3], 4 );
F[7] = _mm256_slli_epi32( F[7], 4 );
ADD_SUB( F[0], F[2] );
ADD_SUB( F[1], F[3] );
ADD_SUB( F[4], F[6] );
ADD_SUB( F[5], F[7] );
F[5] = _mm256_slli_epi32( F[5], 2 );
F[6] = _mm256_slli_epi32( F[6], 4 );
F[7] = _mm256_slli_epi32( F[7], 6 );
ADD_SUB( F[0], F[4] );
ADD_SUB( F[1], F[5] );
ADD_SUB( F[2], F[6] );
ADD_SUB( F[3], F[7] );
ADD_SUB( F0, F1 );
ADD_SUB( F2, F3 );
ADD_SUB( F4, F5 );
ADD_SUB( F6, F7 );
F3 = _mm256_slli_epi32( F3, 4 );
F7 = _mm256_slli_epi32( F7, 4 );
ADD_SUB( F0, F2 );
ADD_SUB( F1, F3 );
ADD_SUB( F4, F6 );
ADD_SUB( F5, F7 );
F5 = _mm256_slli_epi32( F5, 2 );
F6 = _mm256_slli_epi32( F6, 4 );
F7 = _mm256_slli_epi32( F7, 6 );
ADD_SUB( F0, F4 );
ADD_SUB( F1, F5 );
ADD_SUB( F2, F6 );
ADD_SUB( F3, F7 );
#undef ADD_SUB
#if defined (__AVX512VL__) && defined(__AVX512BW__)
const __m256i mask = _mm256_movm_epi8( 0x11111111 );
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_maskz_mov_epi8( 0x11111111, a ), \
_mm256_srai_epi32( a, 8 ) )
#else
const __m256i mask = m256_const1_32( 0x000000ff );
#endif
const __m256i mask = _mm256_set1_epi32( 0x000000ff );
#define Q_REDUCE( a ) \
_mm256_sub_epi32( _mm256_and_si256( a, mask ), \
_mm256_srai_epi32( a, 8 ) )
#endif
out[0] = Q_REDUCE( F[0] );
out[1] = Q_REDUCE( F[1] );
out[2] = Q_REDUCE( F[2] );
out[3] = Q_REDUCE( F[3] );
out[4] = Q_REDUCE( F[4] );
out[5] = Q_REDUCE( F[5] );
out[6] = Q_REDUCE( F[6] );
out[7] = Q_REDUCE( F[7] );
out[0] = Q_REDUCE( F0 );
out[1] = Q_REDUCE( F1 );
out[2] = Q_REDUCE( F2 );
out[3] = Q_REDUCE( F3 );
out[4] = Q_REDUCE( F4 );
out[5] = Q_REDUCE( F5 );
out[6] = Q_REDUCE( F6 );
out[7] = Q_REDUCE( F7 );
#undef Q_REDUCE
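// Scalar sketch of the two primitives above (my illustration): ADD_SUB is
// the radix-2 butterfly (a, b) -> (a + b, a - b), and Q_REDUCE folds each
// value mod 257 using 256 = -1 (mod 257).

#include <stdint.h>

static inline void add_sub( int32_t *a, int32_t *b )
{
   int32_t t = *b;
   *b = *a - t;     // difference
   *a = *a + t;     // sum
}

static inline int32_t q_reduce( int32_t a )
{
   return ( a & 0xff ) - ( a >> 8 );   // lo - hi = a (mod 257)
}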
@@ -763,12 +760,10 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
ADD_SUB( F[ 9], F[11] );
ADD_SUB( F[12], F[14] );
ADD_SUB( F[13], F[15] );
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
F[14] = _mm_slli_epi32( F[14], 4 );
F[15] = _mm_slli_epi32( F[15], 4 );
ADD_SUB( F[ 0], F[ 4] );
ADD_SUB( F[ 1], F[ 5] );
ADD_SUB( F[ 2], F[ 6] );
@@ -777,14 +772,12 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
ADD_SUB( F[ 9], F[13] );
ADD_SUB( F[10], F[14] );
ADD_SUB( F[11], F[15] );
F[10] = _mm_slli_epi32( F[10], 2 );
F[11] = _mm_slli_epi32( F[11], 2 );
F[12] = _mm_slli_epi32( F[12], 4 );
F[13] = _mm_slli_epi32( F[13], 4 );
F[14] = _mm_slli_epi32( F[14], 6 );
F[15] = _mm_slli_epi32( F[15], 6 );
ADD_SUB( F[ 0], F[ 8] );
ADD_SUB( F[ 1], F[ 9] );
ADD_SUB( F[ 2], F[10] );
@@ -796,7 +789,7 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
#undef ADD_SUB
const __m128i mask = m128_const1_32( 0x000000ff );
const __m128i mask = _mm_set1_epi32( 0x000000ff );
#define Q_REDUCE( a ) \
_mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) )
@@ -820,16 +813,13 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
#undef Q_REDUCE
#else // < SSE4.1
#else // AVX256 elif SSE4_1
swift_int16_t *mult = multipliers;
// First loop unrolling:
register swift_int16_t *table = &(fftTable[input[0] << 3]);
/*
swift_int16_t *table = &( fftTable[ input[0] << 3 ] );
swift_int32_t F[64];
/*
for (int i = 0; i < 8; i++)
{
int j = i<<3;
@@ -845,99 +835,91 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
}
*/
register swift_int32_t F0, F1, F2, F3, F4, F5, F6, F7, F8, F9,
F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
F20, F21, F22, F23, F24, F25, F26, F27, F28, F29,
F30, F31, F32, F33, F34, F35, F36, F37, F38, F39,
F40, F41, F42, F43, F44, F45, F46, F47, F48, F49,
F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
F60, F61, F62, F63;
F0 = mult[0] * table[0];
F8 = mult[1] * table[1];
F16 = mult[2] * table[2];
F24 = mult[3] * table[3];
F32 = mult[4] * table[4];
F40 = mult[5] * table[5];
F48 = mult[6] * table[6];
F56 = mult[7] * table[7];
F[ 0] = mult[ 0] * table[0];
F[ 8] = mult[ 1] * table[1];
F[16] = mult[ 2] * table[2];
F[24] = mult[ 3] * table[3];
F[32] = mult[ 4] * table[4];
F[40] = mult[ 5] * table[5];
F[48] = mult[ 6] * table[6];
F[56] = mult[ 7] * table[7];
table = &(fftTable[input[1] << 3]);
F1 = mult[ 8] * table[0];
F9 = mult[ 9] * table[1];
F17 = mult[10] * table[2];
F25 = mult[11] * table[3];
F33 = mult[12] * table[4];
F41 = mult[13] * table[5];
F49 = mult[14] * table[6];
F57 = mult[15] * table[7];
F[ 1] = mult[ 8] * table[0];
F[ 9] = mult[ 9] * table[1];
F[17] = mult[10] * table[2];
F[25] = mult[11] * table[3];
F[33] = mult[12] * table[4];
F[41] = mult[13] * table[5];
F[49] = mult[14] * table[6];
F[57] = mult[15] * table[7];
table = &(fftTable[input[2] << 3]);
F2 = mult[16] * table[0];
F10 = mult[17] * table[1];
F18 = mult[18] * table[2];
F26 = mult[19] * table[3];
F34 = mult[20] * table[4];
F42 = mult[21] * table[5];
F50 = mult[22] * table[6];
F58 = mult[23] * table[7];
F[ 2] = mult[16] * table[0];
F[10] = mult[17] * table[1];
F[18] = mult[18] * table[2];
F[26] = mult[19] * table[3];
F[34] = mult[20] * table[4];
F[42] = mult[21] * table[5];
F[50] = mult[22] * table[6];
F[58] = mult[23] * table[7];
table = &(fftTable[input[3] << 3]);
F3 = mult[24] * table[0];
F11 = mult[25] * table[1];
F19 = mult[26] * table[2];
F27 = mult[27] * table[3];
F35 = mult[28] * table[4];
F43 = mult[29] * table[5];
F51 = mult[30] * table[6];
F59 = mult[31] * table[7];
F[ 3] = mult[24] * table[0];
F[11] = mult[25] * table[1];
F[19] = mult[26] * table[2];
F[27] = mult[27] * table[3];
F[35] = mult[28] * table[4];
F[43] = mult[29] * table[5];
F[51] = mult[30] * table[6];
F[59] = mult[31] * table[7];
table = &(fftTable[input[4] << 3]);
F4 = mult[32] * table[0];
F12 = mult[33] * table[1];
F20 = mult[34] * table[2];
F28 = mult[35] * table[3];
F36 = mult[36] * table[4];
F44 = mult[37] * table[5];
F52 = mult[38] * table[6];
F60 = mult[39] * table[7];
F[ 4] = mult[32] * table[0];
F[12] = mult[33] * table[1];
F[20] = mult[34] * table[2];
F[28] = mult[35] * table[3];
F[36] = mult[36] * table[4];
F[44] = mult[37] * table[5];
F[52] = mult[38] * table[6];
F[60] = mult[39] * table[7];
table = &(fftTable[input[5] << 3]);
F5 = mult[40] * table[0];
F13 = mult[41] * table[1];
F21 = mult[42] * table[2];
F29 = mult[43] * table[3];
F37 = mult[44] * table[4];
F45 = mult[45] * table[5];
F53 = mult[46] * table[6];
F61 = mult[47] * table[7];
F[ 5] = mult[40] * table[0];
F[13] = mult[41] * table[1];
F[21] = mult[42] * table[2];
F[29] = mult[43] * table[3];
F[37] = mult[44] * table[4];
F[45] = mult[45] * table[5];
F[53] = mult[46] * table[6];
F[61] = mult[47] * table[7];
table = &(fftTable[input[6] << 3]);
F6 = mult[48] * table[0];
F14 = mult[49] * table[1];
F22 = mult[50] * table[2];
F30 = mult[51] * table[3];
F38 = mult[52] * table[4];
F46 = mult[53] * table[5];
F54 = mult[54] * table[6];
F62 = mult[55] * table[7];
F[ 6] = mult[48] * table[0];
F[14] = mult[49] * table[1];
F[22] = mult[50] * table[2];
F[30] = mult[51] * table[3];
F[38] = mult[52] * table[4];
F[46] = mult[53] * table[5];
F[54] = mult[54] * table[6];
F[62] = mult[55] * table[7];
table = &(fftTable[input[7] << 3]);
F7 = mult[56] * table[0];
F15 = mult[57] * table[1];
F23 = mult[58] * table[2];
F31 = mult[59] * table[3];
F39 = mult[60] * table[4];
F47 = mult[61] * table[5];
F55 = mult[62] * table[6];
F63 = mult[63] * table[7];
F[ 7] = mult[56] * table[0];
F[15] = mult[57] * table[1];
F[23] = mult[58] * table[2];
F[31] = mult[59] * table[3];
F[39] = mult[60] * table[4];
F[47] = mult[61] * table[5];
F[55] = mult[62] * table[6];
F[63] = mult[63] * table[7];
#define ADD_SUB( a, b ) \
{ \
@@ -987,262 +969,229 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
}
*/
// Second loop unrolling:
// Iteration 0:
ADD_SUB(F0, F1);
ADD_SUB(F2, F3);
ADD_SUB(F4, F5);
ADD_SUB(F6, F7);
ADD_SUB( F[ 0], F[ 1] );
ADD_SUB( F[ 2], F[ 3] );
ADD_SUB( F[ 4], F[ 5] );
ADD_SUB( F[ 6], F[ 7] );
F[ 3] <<= 4;
F[ 7] <<= 4;
ADD_SUB( F[ 0], F[ 2] );
ADD_SUB( F[ 1], F[ 3] );
ADD_SUB( F[ 4], F[ 6] );
ADD_SUB( F[ 5], F[ 7] );
F[ 5] <<= 2;
F[ 6] <<= 4;
F[ 7] <<= 6;
ADD_SUB( F[ 0], F[ 4] );
ADD_SUB( F[ 1], F[ 5] );
ADD_SUB( F[ 2], F[ 6] );
ADD_SUB( F[ 3], F[ 7] );
F3 <<= 4;
F7 <<= 4;
ADD_SUB(F0, F2);
ADD_SUB(F1, F3);
ADD_SUB(F4, F6);
ADD_SUB(F5, F7);
F5 <<= 2;
F6 <<= 4;
F7 <<= 6;
ADD_SUB(F0, F4);
ADD_SUB(F1, F5);
ADD_SUB(F2, F6);
ADD_SUB(F3, F7);
output[0] = Q_REDUCE(F0);
output[8] = Q_REDUCE(F1);
output[16] = Q_REDUCE(F2);
output[24] = Q_REDUCE(F3);
output[32] = Q_REDUCE(F4);
output[40] = Q_REDUCE(F5);
output[48] = Q_REDUCE(F6);
output[56] = Q_REDUCE(F7);
output[ 0] = Q_REDUCE( F[ 0] );
output[ 8] = Q_REDUCE( F[ 1] );
output[16] = Q_REDUCE( F[ 2] );
output[24] = Q_REDUCE( F[ 3] );
output[32] = Q_REDUCE( F[ 4] );
output[40] = Q_REDUCE( F[ 5] );
output[48] = Q_REDUCE( F[ 6] );
output[56] = Q_REDUCE( F[ 7] );
// Iteration 1:
ADD_SUB(F8, F9);
ADD_SUB(F10, F11);
ADD_SUB(F12, F13);
ADD_SUB(F14, F15);
ADD_SUB( F[ 8], F[ 9] );
ADD_SUB( F[10], F[11] );
ADD_SUB( F[12], F[13] );
ADD_SUB( F[14], F[15] );
F[11] <<= 4;
F[15] <<= 4;
ADD_SUB( F[ 8], F[10] );
ADD_SUB( F[ 9], F[11] );
ADD_SUB( F[12], F[14] );
ADD_SUB( F[13], F[15] );
F[13] <<= 2;
F[14] <<= 4;
F[15] <<= 6;
ADD_SUB( F[ 8], F[12] );
ADD_SUB( F[ 9], F[13] );
ADD_SUB( F[10], F[14] );
ADD_SUB( F[11], F[15] );
F11 <<= 4;
F15 <<= 4;
ADD_SUB(F8, F10);
ADD_SUB(F9, F11);
ADD_SUB(F12, F14);
ADD_SUB(F13, F15);
F13 <<= 2;
F14 <<= 4;
F15 <<= 6;
ADD_SUB(F8, F12);
ADD_SUB(F9, F13);
ADD_SUB(F10, F14);
ADD_SUB(F11, F15);
output[1] = Q_REDUCE(F8);
output[9] = Q_REDUCE(F9);
output[17] = Q_REDUCE(F10);
output[25] = Q_REDUCE(F11);
output[33] = Q_REDUCE(F12);
output[41] = Q_REDUCE(F13);
output[49] = Q_REDUCE(F14);
output[57] = Q_REDUCE(F15);
output[ 1] = Q_REDUCE( F[ 8] );
output[ 9] = Q_REDUCE( F[ 9] );
output[17] = Q_REDUCE( F[10] );
output[25] = Q_REDUCE( F[11] );
output[33] = Q_REDUCE( F[12] );
output[41] = Q_REDUCE( F[13] );
output[49] = Q_REDUCE( F[14] );
output[57] = Q_REDUCE( F[15] );
// Iteration 2:
ADD_SUB(F16, F17);
ADD_SUB(F18, F19);
ADD_SUB(F20, F21);
ADD_SUB(F22, F23);
ADD_SUB( F[16], F[17] );
ADD_SUB( F[18], F[19] );
ADD_SUB( F[20], F[21] );
ADD_SUB( F[22], F[23] );
F[19] <<= 4;
F[23] <<= 4;
ADD_SUB( F[16], F[18] );
ADD_SUB( F[17], F[19] );
ADD_SUB( F[20], F[22] );
ADD_SUB( F[21], F[23] );
F[21] <<= 2;
F[22] <<= 4;
F[23] <<= 6;
ADD_SUB( F[16], F[20] );
ADD_SUB( F[17], F[21] );
ADD_SUB( F[18], F[22] );
ADD_SUB( F[19], F[23] );
F19 <<= 4;
F23 <<= 4;
ADD_SUB(F16, F18);
ADD_SUB(F17, F19);
ADD_SUB(F20, F22);
ADD_SUB(F21, F23);
F21 <<= 2;
F22 <<= 4;
F23 <<= 6;
ADD_SUB(F16, F20);
ADD_SUB(F17, F21);
ADD_SUB(F18, F22);
ADD_SUB(F19, F23);
output[2] = Q_REDUCE(F16);
output[10] = Q_REDUCE(F17);
output[18] = Q_REDUCE(F18);
output[26] = Q_REDUCE(F19);
output[34] = Q_REDUCE(F20);
output[42] = Q_REDUCE(F21);
output[50] = Q_REDUCE(F22);
output[58] = Q_REDUCE(F23);
output[ 2] = Q_REDUCE( F[16] );
output[10] = Q_REDUCE( F[17] );
output[18] = Q_REDUCE( F[18] );
output[26] = Q_REDUCE( F[19] );
output[34] = Q_REDUCE( F[20] );
output[42] = Q_REDUCE( F[21] );
output[50] = Q_REDUCE( F[22] );
output[58] = Q_REDUCE( F[23] );
// Iteration 3:
ADD_SUB(F24, F25);
ADD_SUB(F26, F27);
ADD_SUB(F28, F29);
ADD_SUB(F30, F31);
ADD_SUB( F[24], F[25] );
ADD_SUB( F[26], F[27] );
ADD_SUB( F[28], F[29] );
ADD_SUB( F[30], F[31] );
F[27] <<= 4;
F[31] <<= 4;
ADD_SUB( F[24], F[26] );
ADD_SUB( F[25], F[27] );
ADD_SUB( F[28], F[30] );
ADD_SUB( F[29], F[31] );
F[29] <<= 2;
F[30] <<= 4;
F[31] <<= 6;
ADD_SUB( F[24], F[28] );
ADD_SUB( F[25], F[29] );
ADD_SUB( F[26], F[30] );
ADD_SUB( F[27], F[31] );
F27 <<= 4;
F31 <<= 4;
ADD_SUB(F24, F26);
ADD_SUB(F25, F27);
ADD_SUB(F28, F30);
ADD_SUB(F29, F31);
F29 <<= 2;
F30 <<= 4;
F31 <<= 6;
ADD_SUB(F24, F28);
ADD_SUB(F25, F29);
ADD_SUB(F26, F30);
ADD_SUB(F27, F31);
output[3] = Q_REDUCE(F24);
output[11] = Q_REDUCE(F25);
output[19] = Q_REDUCE(F26);
output[27] = Q_REDUCE(F27);
output[35] = Q_REDUCE(F28);
output[43] = Q_REDUCE(F29);
output[51] = Q_REDUCE(F30);
output[59] = Q_REDUCE(F31);
output[ 3] = Q_REDUCE( F[24] );
output[11] = Q_REDUCE( F[25] );
output[19] = Q_REDUCE( F[26] );
output[27] = Q_REDUCE( F[27] );
output[35] = Q_REDUCE( F[28] );
output[43] = Q_REDUCE( F[29] );
output[51] = Q_REDUCE( F[30] );
output[59] = Q_REDUCE( F[31] );
// Iteration 4:
ADD_SUB(F32, F33);
ADD_SUB(F34, F35);
ADD_SUB(F36, F37);
ADD_SUB(F38, F39);
ADD_SUB( F[32], F[33] );
ADD_SUB( F[34], F[35] );
ADD_SUB( F[36], F[37] );
ADD_SUB( F[38], F[39] );
F[35] <<= 4;
F[39] <<= 4;
ADD_SUB( F[32], F[34] );
ADD_SUB( F[33], F[35] );
ADD_SUB( F[36], F[38] );
ADD_SUB( F[37], F[39] );
F[37] <<= 2;
F[38] <<= 4;
F[39] <<= 6;
ADD_SUB( F[32], F[36] );
ADD_SUB( F[33], F[37] );
ADD_SUB( F[34], F[38] );
ADD_SUB( F[35], F[39] );
F35 <<= 4;
F39 <<= 4;
ADD_SUB(F32, F34);
ADD_SUB(F33, F35);
ADD_SUB(F36, F38);
ADD_SUB(F37, F39);
F37 <<= 2;
F38 <<= 4;
F39 <<= 6;
ADD_SUB(F32, F36);
ADD_SUB(F33, F37);
ADD_SUB(F34, F38);
ADD_SUB(F35, F39);
output[4] = Q_REDUCE(F32);
output[12] = Q_REDUCE(F33);
output[20] = Q_REDUCE(F34);
output[28] = Q_REDUCE(F35);
output[36] = Q_REDUCE(F36);
output[44] = Q_REDUCE(F37);
output[52] = Q_REDUCE(F38);
output[60] = Q_REDUCE(F39);
output[ 4] = Q_REDUCE( F[32] );
output[12] = Q_REDUCE( F[33] );
output[20] = Q_REDUCE( F[34] );
output[28] = Q_REDUCE( F[35] );
output[36] = Q_REDUCE( F[36] );
output[44] = Q_REDUCE( F[37] );
output[52] = Q_REDUCE( F[38] );
output[60] = Q_REDUCE( F[39] );
// Iteration 5:
ADD_SUB(F40, F41);
ADD_SUB(F42, F43);
ADD_SUB(F44, F45);
ADD_SUB(F46, F47);
ADD_SUB( F[40], F[41] );
ADD_SUB( F[42], F[43] );
ADD_SUB( F[44], F[45] );
ADD_SUB( F[46], F[47] );
F[43] <<= 4;
F[47] <<= 4;
ADD_SUB( F[40], F[42] );
ADD_SUB( F[41], F[43] );
ADD_SUB( F[44], F[46] );
ADD_SUB( F[45], F[47] );
F[45] <<= 2;
F[46] <<= 4;
F[47] <<= 6;
ADD_SUB( F[40], F[44] );
ADD_SUB( F[41], F[45] );
ADD_SUB( F[42], F[46] );
ADD_SUB( F[43], F[47] );
F43 <<= 4;
F47 <<= 4;
ADD_SUB(F40, F42);
ADD_SUB(F41, F43);
ADD_SUB(F44, F46);
ADD_SUB(F45, F47);
F45 <<= 2;
F46 <<= 4;
F47 <<= 6;
ADD_SUB(F40, F44);
ADD_SUB(F41, F45);
ADD_SUB(F42, F46);
ADD_SUB(F43, F47);
output[5] = Q_REDUCE(F40);
output[13] = Q_REDUCE(F41);
output[21] = Q_REDUCE(F42);
output[29] = Q_REDUCE(F43);
output[37] = Q_REDUCE(F44);
output[45] = Q_REDUCE(F45);
output[53] = Q_REDUCE(F46);
output[61] = Q_REDUCE(F47);
output[ 5] = Q_REDUCE( F[40] );
output[13] = Q_REDUCE( F[41] );
output[21] = Q_REDUCE( F[42] );
output[29] = Q_REDUCE( F[43] );
output[37] = Q_REDUCE( F[44] );
output[45] = Q_REDUCE( F[45] );
output[53] = Q_REDUCE( F[46] );
output[61] = Q_REDUCE( F[47] );
// Iteration 6:
ADD_SUB(F48, F49);
ADD_SUB(F50, F51);
ADD_SUB(F52, F53);
ADD_SUB(F54, F55);
ADD_SUB( F[48], F[49] );
ADD_SUB( F[50], F[51] );
ADD_SUB( F[52], F[53] );
ADD_SUB( F[54], F[55] );
F[51] <<= 4;
F[55] <<= 4;
ADD_SUB( F[48], F[50] );
ADD_SUB( F[49], F[51] );
ADD_SUB( F[52], F[54] );
ADD_SUB( F[53], F[55] );
F[53] <<= 2;
F[54] <<= 4;
F[55] <<= 6;
ADD_SUB( F[48], F[52] );
ADD_SUB( F[49], F[53] );
ADD_SUB( F[50], F[54] );
ADD_SUB( F[51], F[55] );
F51 <<= 4;
F55 <<= 4;
ADD_SUB(F48, F50);
ADD_SUB(F49, F51);
ADD_SUB(F52, F54);
ADD_SUB(F53, F55);
F53 <<= 2;
F54 <<= 4;
F55 <<= 6;
ADD_SUB(F48, F52);
ADD_SUB(F49, F53);
ADD_SUB(F50, F54);
ADD_SUB(F51, F55);
output[6] = Q_REDUCE(F48);
output[14] = Q_REDUCE(F49);
output[22] = Q_REDUCE(F50);
output[30] = Q_REDUCE(F51);
output[38] = Q_REDUCE(F52);
output[46] = Q_REDUCE(F53);
output[54] = Q_REDUCE(F54);
output[62] = Q_REDUCE(F55);
output[ 6] = Q_REDUCE( F[48] );
output[14] = Q_REDUCE( F[49] );
output[22] = Q_REDUCE( F[50] );
output[30] = Q_REDUCE( F[51] );
output[38] = Q_REDUCE( F[52] );
output[46] = Q_REDUCE( F[53] );
output[54] = Q_REDUCE( F[54] );
output[62] = Q_REDUCE( F[55] );
// Iteration 7:
ADD_SUB(F56, F57);
ADD_SUB(F58, F59);
ADD_SUB(F60, F61);
ADD_SUB(F62, F63);
ADD_SUB( F[56], F[57] );
ADD_SUB( F[58], F[59] );
ADD_SUB( F[60], F[61] );
ADD_SUB( F[62], F[63] );
F[59] <<= 4;
F[63] <<= 4;
ADD_SUB( F[56], F[58] );
ADD_SUB( F[57], F[59] );
ADD_SUB( F[60], F[62] );
ADD_SUB( F[61], F[63] );
F[61] <<= 2;
F[62] <<= 4;
F[63] <<= 6;
ADD_SUB( F[56], F[60] );
ADD_SUB( F[57], F[61] );
ADD_SUB( F[58], F[62] );
ADD_SUB( F[59], F[63] );
F59 <<= 4;
F63 <<= 4;
ADD_SUB(F56, F58);
ADD_SUB(F57, F59);
ADD_SUB(F60, F62);
ADD_SUB(F61, F63);
F61 <<= 2;
F62 <<= 4;
F63 <<= 6;
ADD_SUB(F56, F60);
ADD_SUB(F57, F61);
ADD_SUB(F58, F62);
ADD_SUB(F59, F63);
output[7] = Q_REDUCE(F56);
output[15] = Q_REDUCE(F57);
output[23] = Q_REDUCE(F58);
output[31] = Q_REDUCE(F59);
output[39] = Q_REDUCE(F60);
output[47] = Q_REDUCE(F61);
output[55] = Q_REDUCE(F62);
output[63] = Q_REDUCE(F63);
output[ 7] = Q_REDUCE( F[56] );
output[15] = Q_REDUCE( F[57] );
output[23] = Q_REDUCE( F[58] );
output[31] = Q_REDUCE( F[59] );
output[39] = Q_REDUCE( F[60] );
output[47] = Q_REDUCE( F[61] );
output[55] = Q_REDUCE( F[62] );
output[63] = Q_REDUCE( F[63] );
#undef ADD_SUB
#undef Q_REDUCE
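// The inter-layer shifts above are the twiddle multiplications: 2 is a 16th
// root of unity mod 257 (2^16 = 65536 = 255*257 + 1), so multiplying by
// 1 << k applies the twiddle factor 2^k. A quick check (illustration only):

#include <assert.h>
#include <stdint.h>

int main()
{
   uint32_t p = 1;
   for ( int i = 0; i < 16; i++ )
      p = ( p * 2 ) % 257;
   assert( p == 1 );   // 2^16 == 1 (mod 257)
   return 0;
}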

View File

@@ -134,10 +134,10 @@ int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len )
int sha3_4way_final( void *md, sha3_4way_ctx_t *c )
{
c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ],
m256_const1_64( 6 ) );
_mm256_set1_epi64x( 6 ) );
c->st[ c->rsiz / 8 - 1 ] =
_mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ],
m256_const1_64( 0x8000000000000000 ) );
_mm256_set1_epi64x( 0x8000000000000000 ) );
sha3_4way_keccakf( c->st );
memcpy( md, c->st, c->mdlen * 4 );
return 1;
@@ -268,10 +268,10 @@ int sha3_8way_final( void *md, sha3_8way_ctx_t *c )
{
c->st[ c->pt ] =
_mm512_xor_si512( c->st[ c->pt ],
m512_const1_64( 6 ) );
_mm512_set1_epi64( 6 ) );
c->st[ c->rsiz / 8 - 1 ] =
_mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ],
m512_const1_64( 0x8000000000000000 ) );
_mm512_set1_epi64( 0x8000000000000000 ) );
sha3_8way_keccakf( c->st );
memcpy( md, c->st, c->mdlen * 8 );
return 1;
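// Both finals implement standard SHA3 padding lane-wise; a scalar sketch
// (assumption: pad10*1 with the SHA3 domain bits 0b01 per FIPS 202):

#include <stddef.h>
#include <stdint.h>

static void sha3_pad( uint8_t *state, size_t pt, size_t rsiz )
{
   state[pt]       ^= 0x06;   // domain separation bits plus first pad bit
   state[rsiz - 1] ^= 0x80;   // final pad bit at the end of the rate
   // the Keccak-f permutation runs next
}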

View File

@@ -201,7 +201,7 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m512i eight = m512_const1_64( 8 );
const __m512i eight = _mm512_set1_epi64( 8 );
const bool bench = opt_benchmark;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
@@ -369,7 +369,7 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );

View File

@@ -114,7 +114,7 @@ int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n +=8;
} while ( likely( ( n < last_nonce ) && !( *restart ) ) );
pdata[19] = n;
@@ -218,7 +218,7 @@ int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n +=4;
} while ( likely( ( n < last_nonce ) && !( *restart ) ) );
pdata[19] = n;

View File

@@ -536,7 +536,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
@@ -963,7 +963,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;

View File

@@ -49,7 +49,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
@@ -102,7 +102,7 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( ( n < last_nonce ) && !(*restart) );
pdata[19] = n;

View File

@@ -26,7 +26,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
x16rt_getTimeHash( masked_ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
s_ntime = masked_ntime;
if ( opt_debug && !thr_id )
if ( !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
x16r_hash_order, swab32( pdata[17] ), timeHash );
}

View File

@@ -658,7 +658,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
@@ -1143,7 +1143,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;

View File

@@ -181,7 +181,7 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;
@@ -335,7 +335,7 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
pdata[19] = n;

View File

@@ -254,7 +254,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m512i eight = m512_const1_64( 8 );
const __m512i eight = _mm512_set1_epi64( 8 );
const bool bench = opt_benchmark;
// convert LE32 to LE64
@@ -468,7 +468,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32_d7 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
// convert LE32 to LE64

View File

@@ -445,7 +445,7 @@ int scanhash_x22i_8way_sha( struct work *work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -494,7 +494,7 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
}
}
*noncev = _mm512_add_epi32( *noncev,
m512_const1_64( 0x0000000800000000 ) );
_mm512_set1_epi64( 0x0000000800000000 ) );
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -787,7 +787,7 @@ int scanhash_x22i_4way_sha( struct work* work, uint32_t max_nonce,
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;
@@ -835,7 +835,7 @@ int scanhash_x22i_4way( struct work* work, uint32_t max_nonce,
}
}
*noncev = _mm256_add_epi32( *noncev,
m256_const1_64( 0x0000000400000000 ) );
_mm256_set1_epi64x( 0x0000000400000000 ) );
n += 4;
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
pdata[19] = n;

View File

@@ -571,7 +571,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const bool bench = opt_benchmark;
const __m512i eight = m512_const1_64( 8 );
const __m512i eight = _mm512_set1_epi64( 8 );
if ( bench ) ptarget[7] = 0x08ff;
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
@@ -927,7 +927,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t targ32 = ptarget[7];
const __m256i four = m256_const1_64( 4 );
const __m256i four = _mm256_set1_epi64x( 4 );
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x08ff;