diff --git a/Makefile.am b/Makefile.am index 35b5b07..a336089 100644 --- a/Makefile.am +++ b/Makefile.am @@ -43,6 +43,7 @@ cpuminer_SOURCES = \ sysinfos.c \ algo-gate-api.c\ malloc-huge.c \ + simd-utils/simd-constants.c \ algo/argon2d/argon2d-gate.c \ algo/argon2d/blake2/blake2b.c \ algo/argon2d/argon2d/argon2.c \ diff --git a/README.md b/README.md index 6236130..3b4daaf 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ time. Android, IOS and alt OSs like Haiku & ReactOS are not supported. 3. Stratum pool supporting stratum+tcp:// or stratum+ssl:// protocols or -RPC getblockte,plate using http:// or https://. +RPC getblocktemplate using http:// or https://. Supported Algorithms -------------------- diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 283ed43..ae8ff13 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,10 +75,16 @@ If not what makes it happen or not happen? Change Log ---------- +v25.4 + +x86_64: improved handling of vector constants used for byte permutations. +x86_64: removed hooks for cancelled AVX10-256. +Minor bug fixes & improvements. +More code cleanup. + v25.3 #442, #443: Fixed a regression in Makefile.am. -Updated dockerfile. Removed algo features log display. Some code cleanup. @@ -87,7 +93,7 @@ v25.2 ARM: Fixed regression from v25.1 that could cause build fail. BSD: FreeBSD is now supported. Other BSDs may also work. MacOS: build with installed jansson library instead of compiling the included source code. -Windows: remove "_WIN32_WINNT=0x0601" which is a downgrade on Win11. +Windows: remove "_WIN32_WINNT=0x0601" which was a downgrade on Win11. Changed build.sh shell from bash to sh. v25.1 diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c index 7662bec..d93c368 100644 --- a/algo/blake/blake-4way.c +++ b/algo/blake/blake-4way.c @@ -6,15 +6,15 @@ #if defined (BLAKE_4WAY) -blake256r14_4way_context blake_4w_ctx; +blake256r14_4x32_context blake_4w_ctx; void blakehash_4way(void *state, const void *input) { uint32_t vhash[8*4] __attribute__ ((aligned (64))); - blake256r14_4way_context ctx; + blake256r14_4x32_context ctx; memcpy( &ctx, &blake_4w_ctx, sizeof ctx ); - blake256r14_4way_update( &ctx, input + (64<<2), 16 ); - blake256r14_4way_close( &ctx, vhash ); + blake256r14_4x32_update( &ctx, input + (64<<2), 16 ); + blake256r14_4x32_close( &ctx, vhash ); dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 ); } @@ -35,8 +35,8 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce, HTarget = 0x7f; v128_bswap32_intrlv80_4x32( vdata, pdata ); - blake256r14_4way_init( &blake_4w_ctx ); - blake256r14_4way_update( &blake_4w_ctx, vdata, 64 ); + blake256r14_4x32_init( &blake_4w_ctx ); + blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 ); do { *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) ); @@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce, #if defined(BLAKE_8WAY) -blake256r14_8way_context blake_8w_ctx; +blake256r14_8x32_context blake_8w_ctx; void blakehash_8way( void *state, const void *input ) { uint32_t vhash[8*8] __attribute__ ((aligned (64))); - blake256r14_8way_context ctx; + blake256r14_8x32_context ctx; memcpy( &ctx, &blake_8w_ctx, sizeof ctx ); - blake256r14_8way( &ctx, input + (64<<3), 16 ); - blake256r14_8way_close( &ctx, vhash ); + blake256r14_8x32( &ctx, input + (64<<3), 16 ); + blake256r14_8x32_close( &ctx, vhash ); _dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128, state+160, state+192, state+224, vhash, 256 ); @@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work 
*work, uint32_t max_nonce, mm256_bswap32_intrlv80_8x32( vdata, pdata ); - blake256r14_8way_init( &blake_8w_ctx ); - blake256r14_8way( &blake_8w_ctx, vdata, 64 ); + blake256r14_8x32_init( &blake_8w_ctx ); + blake256r14_8x32( &blake_8w_ctx, vdata, 64 ); do { *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4, diff --git a/algo/blake/blake256-hash.c b/algo/blake/blake256-hash.c index 066e8e7..4e1a43b 100644 --- a/algo/blake/blake256-hash.c +++ b/algo/blake/blake256-hash.c @@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, (state)->T1 = T1; \ } while (0) - -#if defined(__SSSE3__) - -#define BLAKE256_4X32_BLOCK_BSWAP32 \ -{ \ - v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \ - 0x0405060700010203 ); \ - M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \ - M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \ - M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \ - M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \ - M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \ - M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \ - M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \ - M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \ - M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \ - M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \ - MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \ - MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \ - MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \ - MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \ - ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \ - MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \ -} - -#else // SSE2 - #define BLAKE256_4X32_BLOCK_BSWAP32 \ { \ M0 = v128_bswap32( buf[0] ); \ @@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, MF = v128_bswap32( buf[15] ); \ } -#endif // SSSE3 else SSE2 - #define COMPRESS32_4X32( rounds ) \ { \ v128_t M0, M1, M2, M3, M4, M5, M6, M7; \ @@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate, ROUND_S_4X32_3; } -#if defined(__SSSE3__) - - const v128_t shuf_bswap32 = - v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); - - H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 ); - H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 ); - H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 ); - H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 ); - H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 ); - H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 ); - H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 ); - H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 ); - -#else - H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) ); H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) ); H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) ); @@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate, H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) ); H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) ); H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) ); - -#endif } #if defined (__AVX2__) @@ -1291,24 +1244,22 @@ do { \ VD = v256_32( T0 ^ 0x299F31D0 ); \ VE = v256_32( T1 ^ 0x082EFA98 ); \ VF = v256_32( T1 ^ 0xEC4E6C89 ); \ - const __m256i shuf_bswap32 = mm256_set2_64( \ - 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ - M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \ - M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \ - M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \ - M3 = 
_mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \ - M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \ - M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \ - M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \ - M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \ - M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \ - M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \ - MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \ - MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \ - MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \ - MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \ - ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \ - MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \ + M0 = mm256_bswap_32( * buf ); \ + M1 = mm256_bswap_32( *(buf+ 1) ); \ + M2 = mm256_bswap_32( *(buf+ 2) ); \ + M3 = mm256_bswap_32( *(buf+ 3) ); \ + M4 = mm256_bswap_32( *(buf+ 4) ); \ + M5 = mm256_bswap_32( *(buf+ 5) ); \ + M6 = mm256_bswap_32( *(buf+ 6) ); \ + M7 = mm256_bswap_32( *(buf+ 7) ); \ + M8 = mm256_bswap_32( *(buf+ 8) ); \ + M9 = mm256_bswap_32( *(buf+ 9) ); \ + MA = mm256_bswap_32( *(buf+10) ); \ + MB = mm256_bswap_32( *(buf+11) ); \ + MC = mm256_bswap_32( *(buf+12) ); \ + MD = mm256_bswap_32( *(buf+13) ); \ + ME = mm256_bswap_32( *(buf+14) ); \ + MF = mm256_bswap_32( *(buf+15) ); \ ROUND_S_8WAY(0); \ ROUND_S_8WAY(1); \ ROUND_S_8WAY(2); \ @@ -1401,7 +1352,7 @@ do { \ H7 = mm256_xor3( VF, V7, H7 ); \ } -void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, +void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash, void *data ) { __m256i *M = (__m256i*)data; @@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, _mm256_xor_si256( v256_32( CSE ), M[15] ) ); } -void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, +void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate, const void *midhash, const void *data, const int rounds ) { __m256i *H = (__m256i*)final_hash; @@ -1596,17 +1547,14 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, ROUND256_8WAY_3; } - const __m256i shuf_bswap32 = - mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); - - H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 ); - H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 ); - H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 ); - H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 ); - H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 ); - H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 ); - H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 ); - H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 ); + H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) ); + H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) ); + H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) ); + H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) ); + H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) ); + H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) ); + H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) ); + H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) ); } #endif @@ -1933,8 +1881,6 @@ do { \ __m512i M8, M9, MA, MB, MC, MD, ME, MF; \ __m512i V0, V1, V2, V3, V4, V5, V6, V7; \ __m512i V8, V9, VA, VB, VC, VD, VE, VF; \ - const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \ - 0x0c0d0e0f08090a0b, 
0x0405060700010203 ) ); \ V0 = H0; \ V1 = H1; \ V2 = H2; \ @@ -1951,22 +1897,22 @@ do { \ VD = v512_32( T0 ^ 0x299F31D0 ); \ VE = v512_32( T1 ^ 0x082EFA98 ); \ VF = v512_32( T1 ^ 0xEC4E6C89 ); \ - M0 = _mm512_shuffle_epi8( * buf , shuf_bswap32 ); \ - M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \ - M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \ - M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \ - M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \ - M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \ - M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \ - M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \ - M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \ - M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \ - MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \ - MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \ - MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \ - MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \ - ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \ - MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \ + M0 = mm512_bswap_32( * buf ); \ + M1 = mm512_bswap_32( *(buf+ 1) ); \ + M2 = mm512_bswap_32( *(buf+ 2) ); \ + M3 = mm512_bswap_32( *(buf+ 3) ); \ + M4 = mm512_bswap_32( *(buf+ 4) ); \ + M5 = mm512_bswap_32( *(buf+ 5) ); \ + M6 = mm512_bswap_32( *(buf+ 6) ); \ + M7 = mm512_bswap_32( *(buf+ 7) ); \ + M8 = mm512_bswap_32( *(buf+ 8) ); \ + M9 = mm512_bswap_32( *(buf+ 9) ); \ + MA = mm512_bswap_32( *(buf+10) ); \ + MB = mm512_bswap_32( *(buf+11) ); \ + MC = mm512_bswap_32( *(buf+12) ); \ + MD = mm512_bswap_32( *(buf+13) ); \ + ME = mm512_bswap_32( *(buf+14) ); \ + MF = mm512_bswap_32( *(buf+15) ); \ ROUND_S_16WAY(0); \ ROUND_S_16WAY(1); \ ROUND_S_16WAY(2); \ @@ -2063,7 +2009,7 @@ do { \ // is constant for every nonce and only needs to be run once per job. The // second part is run for each nonce using the precalculated midstate and the // hash from the first block. -void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, +void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash, void *data ) { __m512i *M = (__m512i*)data; @@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, } // Dfault is 14 rounds, blakecoin & vanilla are 8. 
-void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, +void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate, const void *midhash, const void *data, const int rounds ) { __m512i *H = (__m512i*)final_hash; @@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, } // Byte swap final hash - const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); - H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 ); - H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 ); - H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 ); - H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 ); - H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 ); - H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 ); - H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 ); - H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 ); + H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) ); + H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) ); + H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) ); + H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) ); + H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) ); + H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) ); + H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) ); + H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) ); } #endif // Blake-256 4 way -static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 }; - static void blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv, - const uint32_t *salt, int rounds ) + int rounds ) { casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 ); casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 ); @@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n, // Blake-256 8 way -static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static void -blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv, - const uint32_t *salt, int rounds ) +blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv, + int rounds ) { casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 ); casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 ); @@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv, } static void -blake32_8way( blake_8way_small_context *sc, const void *data, size_t len ) +blake32_8way( blake256_8x32_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; __m256i *buf; @@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len ) } static void -blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, +blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n, void *dst, size_t out_size_w32 ) { __m256i buf[16]; @@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, } static void -blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len ) +blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; __m256i *buf; @@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len ) } static void -blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n, +blake32_8way_close_le( blake256_8x32_context *sc, 
unsigned ub, unsigned n, void *dst, size_t out_size_w32 ) { __m256i buf[16]; @@ -2622,8 +2563,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n, //Blake-256 16 way AVX512 static void -blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv, - const uint32_t *salt, int rounds ) +blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv, + int rounds ) { casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 ); casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 ); @@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv, } static void -blake32_16way( blake_16way_small_context *sc, const void *data, size_t len ) +blake32_16way( blake256_16x32_context *sc, const void *data, size_t len ) { __m512i *vdata = (__m512i*)data; __m512i *buf; @@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len ) sc->ptr = ptr; } static void -blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n, +blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n, void *dst, size_t out_size_w32 ) { __m512i buf[16]; @@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n, } static void -blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len ) +blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len ) { __m512i *vdata = (__m512i*)data; __m512i *buf; @@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len ) } static void -blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n, +blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n, void *dst, size_t out_size_w32 ) { __m512i buf[16]; @@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n, } void -blake256_16way_init(void *cc) +blake256_16x32_init(void *cc) { - blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 ); + blake32_16way_init( cc, IV256, 14 ); } void -blake256_16way_update(void *cc, const void *data, size_t len) +blake256_16x32_update(void *cc, const void *data, size_t len) { blake32_16way(cc, data, len); } void -blake256_16way_close(void *cc, void *dst) +blake256_16x32_close(void *cc, void *dst) { blake32_16way_close(cc, 0, 0, dst, 8); } void -blake256_16way_update_le(void *cc, const void *data, size_t len) +blake256_16x32_update_le(void *cc, const void *data, size_t len) { blake32_16way_le(cc, data, len); } void -blake256_16way_close_le(void *cc, void *dst) +blake256_16x32_close_le(void *cc, void *dst) { blake32_16way_close_le(cc, 0, 0, dst, 8); } void blake256r14_16way_init(void *cc) { - blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 ); + blake32_16way_init( cc, IV256, 14 ); } void -blake256r14_16way_update(void *cc, const void *data, size_t len) +blake256r14_16x32_update(void *cc, const void *data, size_t len) { blake32_16way(cc, data, len); } void -blake256r14_16way_close(void *cc, void *dst) +blake256r14_16x32_close(void *cc, void *dst) { blake32_16way_close(cc, 0, 0, dst, 8); } void blake256r8_16way_init(void *cc) { - blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 ); + blake32_16way_init( cc, IV256, 8 ); } void -blake256r8_16way_update(void *cc, const void *data, size_t len) +blake256r8_16x32_update(void *cc, const void *data, size_t len) { blake32_16way(cc, data, len); } void -blake256r8_16way_close(void *cc, void *dst) +blake256r8_16x32_close(void *cc, 
void *dst) { blake32_16way_close(cc, 0, 0, dst, 8); } @@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst) void blake256_4x32_init(void *ctx) { - blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 ); + blake32_4x32_init( ctx, IV256, 14 ); } void @@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst) // Blake-256 8 way void -blake256_8way_init(void *cc) +blake256_8x32_init(void *cc) { - blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 ); + blake32_8way_init( cc, IV256, 14 ); } void -blake256_8way_update(void *cc, const void *data, size_t len) +blake256_8x32_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } void -blake256_8way_close(void *cc, void *dst) +blake256_8x32_close(void *cc, void *dst) { blake32_8way_close(cc, 0, 0, dst, 8); } void -blake256_8way_update_le(void *cc, const void *data, size_t len) +blake256_8x32_update_le(void *cc, const void *data, size_t len) { blake32_8way_le(cc, data, len); } void -blake256_8way_close_le(void *cc, void *dst) +blake256_8x32_close_le(void *cc, void *dst) { blake32_8way_close_le(cc, 0, 0, dst, 8); } @@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst) // 14 rounds Blake, Decred void blake256r14_4x32_init(void *cc) { - blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 ); + blake32_4x32_init( cc, IV256, 14 ); } void @@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst) #if defined(__AVX2__) -void blake256r14_8way_init(void *cc) +void blake256r14_8x32_init(void *cc) { - blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 ); + blake32_8way_init( cc, IV256, 14 ); } void -blake256r14_8way_update(void *cc, const void *data, size_t len) +blake256r14_8x32_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } void -blake256r14_8way_close(void *cc, void *dst) +blake256r14_8x32_close(void *cc, void *dst) { blake32_8way_close(cc, 0, 0, dst, 8); } @@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst) // 8 rounds Blakecoin, Vanilla void blake256r8_4x32_init(void *cc) { - blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 ); + blake32_4x32_init( cc, IV256, 8 ); } void @@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst) #if defined (__AVX2__) -void blake256r8_8way_init(void *cc) +void blake256r8_8x32_init(void *cc) { - blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 ); + blake32_8way_init( cc, IV256, 8 ); } void -blake256r8_8way_update(void *cc, const void *data, size_t len) +blake256r8_8x32_update(void *cc, const void *data, size_t len) { blake32_8way(cc, data, len); } void -blake256r8_8way_close(void *cc, void *dst) +blake256r8_8x32_close(void *cc, void *dst) { blake32_8way_close(cc, 0, 0, dst, 8); } diff --git a/algo/blake/blake256-hash.h b/algo/blake/blake256-hash.h index 6b53ef5..b310ba2 100644 --- a/algo/blake/blake256-hash.h +++ b/algo/blake/blake256-hash.h @@ -29,13 +29,6 @@ typedef struct void blake256_transform_le( uint32_t *H, const uint32_t *buf, const uint32_t T0, const uint32_t T1, int rounds ); -/* -void blake256_init( blake256_context *sc ); -void blake256_update( blake256_context *sc, const void *data, size_t len ); -void blake256_close( blake256_context *sc, void *dst ); -void blake256_full( blake256_context *sc, void *dst, const void *data, - size_t len ); -*/ ////////////////////////////////// // @@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context; void blake256_4x32_init(void *ctx); void blake256_4x32_update(void *ctx, const void *data, size_t len); void 
blake256_4x32_close(void *ctx, void *dst); +void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash, + void *data ); +void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate, + const void *midhash, const void *data, const int rounds ); // 14 rounds typedef blake_4x32_small_context blake256r14_4x32_context; @@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc); void blake256r8_4x32_update(void *cc, const void *data, size_t len); void blake256r8_4x32_close(void *cc, void *dst); -void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash, - void *data ); -void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate, - const void *midhash, const void *data, const int rounds ); - -#define blake_4way_small_context blake256_4x32_context -#define blake256_4way_context blake256_4x32_context -#define blake256_4way_init blake256_4x32_init -#define blake256_4way_update blake256_4x32_update -#define blake256_4way_close blake256_4x32_close -#define blake256_4way_update_le blake256_4x32_update_le -#define blake256_4way_close_le blake256_4x32_close_le -#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le -#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le -#define blake256r14_4way_context blake256r14_4x32_context -#define blake256r14_4way_init blake256r14_4x32_init -#define blake256r14_4way_update blake256r14_4x32_update -#define blake256r14_4way_close blake256r14_4x32_close -#define blake256r8_4way_context blake256r14_4x32_context -#define blake256r8_4way_init blake256r14_4x32_init -#define blake256r8_4way_update blake256r14_4x32_update -#define blake256r8_4way_close blake256r14_4x32_close - #ifdef __AVX2__ ////////////////////////////// @@ -107,45 +81,28 @@ typedef struct } blake_8way_small_context; // Default 14 rounds -typedef blake_8way_small_context blake256_8way_context; -void blake256_8way_init(void *cc); -void blake256_8way_update(void *cc, const void *data, size_t len); -void blake256_8way_close(void *cc, void *dst); -void blake256_8way_update_le(void *cc, const void *data, size_t len); -void blake256_8way_close_le(void *cc, void *dst); -void blake256_8way_round0_prehash_le( void *midstate, const void *midhash, +typedef blake_8way_small_context blake256_8x32_context; +void blake256_8x32_init(void *cc); +void blake256_8x32_update(void *cc, const void *data, size_t len); +void blake256_8x32_close(void *cc, void *dst); +void blake256_8x32_update_le(void *cc, const void *data, size_t len); +void blake256_8x32_close_le(void *cc, void *dst); +void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash, void *data ); -void blake256_8way_final_rounds_le( void *final_hash, const void *midstate, +void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate, const void *midhash, const void *data, const int rounds ); // 14 rounds, blake, decred -typedef blake_8way_small_context blake256r14_8way_context; -void blake256r14_8way_init(void *cc); -void blake256r14_8way_update(void *cc, const void *data, size_t len); -void blake256r14_8way_close(void *cc, void *dst); +typedef blake_8way_small_context blake256r14_8x32_context; +void blake256r14_8x32_init(void *cc); +void blake256r14_8x32_update(void *cc, const void *data, size_t len); +void blake256r14_8x32_close(void *cc, void *dst); // 8 rounds, blakecoin, vanilla -typedef blake_8way_small_context blake256r8_8way_context; -void blake256r8_8way_init(void *cc); -void blake256r8_8way_update(void *cc, const void *data, size_t len); -void 
blake256r8_8way_close(void *cc, void *dst); - -#define blake_8x32_small_context blake256_8way_context -#define blake_8x32_init blake256_8way_init -#define blake_8x32_update blake256_8way_update -#define blake_8x32_close blake256_8way_close -#define blake_8x32_update_le blake256_8way_update_le -#define blake_8x32_close_le blake256_8way_close_le -#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash -#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le -#define blake256r14_8x32_context blake256r14_8way_context -#define blake256r14_8x32_init blake256r14_8way_init -#define blake256r14_8x32_update blake256r14_8way_update -#define blake256r14_8x32_close blake256r14_8way_close -#define blake256r8_8x32_context blake256r14_8way_context -#define blake256r8_8x32_init blake256r14_8way_init -#define blake256r8_8x32_update blake256r14_8way_update -#define blake256r8_8x32_close blake256r14_8way_close +typedef blake_8way_small_context blake256r8_8x32_context; +void blake256r8_8x32_init(void *cc); +void blake256r8_8x32_update(void *cc, const void *data, size_t len); +void blake256r8_8x32_close(void *cc, void *dst); #if defined(SIMD512) @@ -163,46 +120,29 @@ typedef struct } blake_16way_small_context __attribute__ ((aligned (128))); // Default 14 rounds -typedef blake_16way_small_context blake256_16way_context; -void blake256_16way_init(void *cc); -void blake256_16way_update(void *cc, const void *data, size_t len); -void blake256_16way_close(void *cc, void *dst); +typedef blake_16way_small_context blake256_16x32_context; +void blake256_16x32_init(void *cc); +void blake256_16x32_update(void *cc, const void *data, size_t len); +void blake256_16x32_close(void *cc, void *dst); // Expects data in little endian order, no byte swap needed -void blake256_16way_update_le(void *cc, const void *data, size_t len); -void blake256_16way_close_le(void *cc, void *dst); -void blake256_16way_round0_prehash_le( void *midstate, const void *midhash, +void blake256_16x32_update_le(void *cc, const void *data, size_t len); +void blake256_16x32_close_le(void *cc, void *dst); +void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash, void *data ); -void blake256_16way_final_rounds_le( void *final_hash, const void *midstate, +void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate, const void *midhash, const void *data, const int rounds ); // 14 rounds, blake, decred -typedef blake_16way_small_context blake256r14_16way_context; -void blake256r14_16way_init(void *cc); -void blake256r14_16way_update(void *cc, const void *data, size_t len); -void blake256r14_16way_close(void *cc, void *dst); +typedef blake_16way_small_context blake256r14_16x32_context; +void blake256r14_16x32_init(void *cc); +void blake256r14_16x32_update(void *cc, const void *data, size_t len); +void blake256r14_16x32_close(void *cc, void *dst); // 8 rounds, blakecoin, vanilla -typedef blake_16way_small_context blake256r8_16way_context; -void blake256r8_16way_init(void *cc); -void blake256r8_16way_update(void *cc, const void *data, size_t len); -void blake256r8_16way_close(void *cc, void *dst); - -#define blake_16x32_small_context blake256_16way_context -#define blake_16x32_init blake256_16way_init -#define blake_16x32_update blake256_16way_update -#define blake_16x32_close blake256_16way_close -#define blake_16x32_update_le blake256_16way_update_le -#define blake_16x32_close_le blake256_16way_close_le -#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash -#define blake_16x32_final_rounds_le 
blake256_16way_final_rounds_le -#define blake256r14_16x32_context blake256r14_16way_context -#define blake256r14_16x32_init blake256r14_16way_init -#define blake256r14_16x32_update blake256r14_16way_update -#define blake256r14_16x32_close blake256r14_16way_close -#define blake256r8_16x32_context blake256r8_16way_context -#define blake256r8_16x32_init blake256r8_16way_init -#define blake256r8_16x32_update blake256r8_16way_update -#define blake256r8_16x32_close blake256r8_16way_close +typedef blake_16way_small_context blake256r8_16x32_context; +void blake256r8_16x32_init(void *cc); +void blake256r8_16x32_update(void *cc, const void *data, size_t len); +void blake256r8_16x32_close(void *cc, void *dst); #endif // AVX512 #endif // AVX2 diff --git a/algo/blake/blake2b-hash.h b/algo/blake/blake2b-hash.h index 6caf804..7195b45 100644 --- a/algo/blake/blake2b-hash.h +++ b/algo/blake/blake2b-hash.h @@ -14,7 +14,6 @@ #define ALIGN(x) __attribute__((aligned(x))) #endif - #if defined(SIMD512) typedef struct ALIGN( 64 ) { @@ -30,11 +29,6 @@ void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input, size_t inlen ); void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out ); -#define blake2b_8way_ctx blake2b_8x64_ctx -#define blake2b_8way_init blake2b_8x64_init -#define blake2b_8way_update blake2b_8x64_update -#define blake2b_8way_final blake2b_8x64_final - #endif #if defined(__AVX2__) @@ -53,11 +47,6 @@ void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input, size_t inlen ); void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out ); -#define blake2b_4way_ctx blake2b_4x64_ctx -#define blake2b_4way_init blake2b_4x64_init -#define blake2b_4way_update blake2b_4x64_update -#define blake2b_4way_final blake2b_4x64_final - #endif #endif diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c index e2b6276..0950ee7 100644 --- a/algo/blake/blake2b.c +++ b/algo/blake/blake2b.c @@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce, uint32_t hash[8*8] __attribute__ ((aligned (128)));; uint32_t vdata[20*8] __attribute__ ((aligned (64)));; uint32_t lane_hash[8] __attribute__ ((aligned (64))); - blake2b_8way_ctx ctx __attribute__ ((aligned (64))); + blake2b_8x64_ctx ctx __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[49]); // 3*16+1 uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce, _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev ); - blake2b_8way_init( &ctx ); - blake2b_8way_update( &ctx, vdata, 80 ); - blake2b_8way_final( &ctx, hash ); + blake2b_8x64_init( &ctx ); + blake2b_8x64_update( &ctx, vdata, 80 ); + blake2b_8x64_final( &ctx, hash ); for ( int lane = 0; lane < 8; lane++ ) if ( hash7[ lane<<1 ] <= Htarg ) @@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce, // Function not used, code inlined. 
void blake2b_4way_hash(void *output, const void *input) { - blake2b_4way_ctx ctx; - blake2b_4way_init( &ctx ); - blake2b_4way_update( &ctx, input, 80 ); - blake2b_4way_final( &ctx, output ); + blake2b_4x64_ctx ctx; + blake2b_4x64_init( &ctx ); + blake2b_4x64_update( &ctx, input, 80 ); + blake2b_4x64_final( &ctx, output ); } int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce, @@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce, uint32_t hash[8*4] __attribute__ ((aligned (64)));; uint32_t vdata[20*4] __attribute__ ((aligned (32)));; uint32_t lane_hash[8] __attribute__ ((aligned (32))); - blake2b_4way_ctx ctx __attribute__ ((aligned (32))); + blake2b_4x64_ctx ctx __attribute__ ((aligned (32))); uint32_t *hash7 = &(hash[25]); // 3*8+1 uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce, *noncev = mm256_intrlv_blend_32( mm256_bswap_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - blake2b_4way_init( &ctx ); - blake2b_4way_update( &ctx, vdata, 80 ); - blake2b_4way_final( &ctx, hash ); + blake2b_4x64_init( &ctx ); + blake2b_4x64_update( &ctx, vdata, 80 ); + blake2b_4x64_final( &ctx, hash ); for ( int lane = 0; lane < 4; lane++ ) if ( hash7[ lane<<1 ] <= Htarg ) diff --git a/algo/blake/blake2s-hash.h b/algo/blake/blake2s-hash.h index f77f4a4..3f09f6e 100644 --- a/algo/blake/blake2s-hash.h +++ b/algo/blake/blake2s-hash.h @@ -61,6 +61,11 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ); int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, const void *input, uint64_t inlen ); +#define blake2s_4x32_state blake2s_4way_state +#define blake2s_4x32_init blake2s_4way_init +#define blake2s_4x32_update blake2s_4way_update +#define blake2s_4x32_final blake2s_4way_final +#define blake2s_4x32_full_blocks blake2s_4way_full_blocks #if defined(__AVX2__) @@ -81,6 +86,12 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen ); int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, const void *input, uint64_t inlen ); +#define blake2s_8x32_state blake2s_8way_state +#define blake2s_8x32_init blake2s_8way_init +#define blake2s_8x32_update blake2s_8way_update +#define blake2s_8x32_final blake2s_8way_final +#define blake2s_8x32_full_blocks blake2s_8way_full_blocks + #endif #if defined(SIMD512) @@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in, uint64_t inlen ); int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ); +#define blake2s_16x32_state blake2s_16way_state +#define blake2s_16x32_init blake2s_16way_init +#define blake2s_16x32_update blake2s_16way_update +#define blake2s_16x32_final blake2s_16way_final + #endif #if 0 diff --git a/algo/blake/blake512-hash.c b/algo/blake/blake512-hash.c index 73799e2..61dd186 100644 --- a/algo/blake/blake512-hash.c +++ b/algo/blake/blake512-hash.c @@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data, VD = v512_64( CB5 ^ T0 ); \ VE = v512_64( CB6 ^ T1 ); \ VF = v512_64( CB7 ^ T1 ); \ - const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \ - M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ - M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ - M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \ - M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \ - M4 = _mm512_shuffle_epi8( *(buf+ 4), 
shuf_bswap64 ); \ - M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \ - M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \ - M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \ - M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \ - M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \ - MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \ - MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \ - MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \ - MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \ - ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \ - MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \ + M0 = mm512_bswap_64( *(buf+ 0) ); \ + M1 = mm512_bswap_64( *(buf+ 1) ); \ + M2 = mm512_bswap_64( *(buf+ 2) ); \ + M3 = mm512_bswap_64( *(buf+ 3) ); \ + M4 = mm512_bswap_64( *(buf+ 4) ); \ + M5 = mm512_bswap_64( *(buf+ 5) ); \ + M6 = mm512_bswap_64( *(buf+ 6) ); \ + M7 = mm512_bswap_64( *(buf+ 7) ); \ + M8 = mm512_bswap_64( *(buf+ 8) ); \ + M9 = mm512_bswap_64( *(buf+ 9) ); \ + MA = mm512_bswap_64( *(buf+10) ); \ + MB = mm512_bswap_64( *(buf+11) ); \ + MC = mm512_bswap_64( *(buf+12) ); \ + MD = mm512_bswap_64( *(buf+13) ); \ + ME = mm512_bswap_64( *(buf+14) ); \ + MF = mm512_bswap_64( *(buf+15) ); \ ROUND_B_8WAY(0); \ ROUND_B_8WAY(1); \ ROUND_B_8WAY(2); \ @@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data, H7 = mm512_xor3( VF, V7, H7 ); \ } -void blake512_8way_compress( blake_8way_big_context *sc ) +void blake512_8x64_compress( blake_8x64_big_context *sc ) { __m512i M0, M1, M2, M3, M4, M5, M6, M7; __m512i M8, M9, MA, MB, MC, MD, ME, MF; @@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc ) VE = v512_64( CB6 ^ sc->T1 ); VF = v512_64( CB7 ^ sc->T1 ); - const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); - - M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 ); - M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 ); - M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 ); - M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 ); - M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 ); - M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 ); - M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 ); - M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 ); - M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 ); - M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 ); - MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 ); - MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 ); - MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 ); - MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 ); - ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 ); - MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 ); + M0 = mm512_bswap_64( sc->buf[ 0] ); + M1 = mm512_bswap_64( sc->buf[ 1] ); + M2 = mm512_bswap_64( sc->buf[ 2] ); + M3 = mm512_bswap_64( sc->buf[ 3] ); + M4 = mm512_bswap_64( sc->buf[ 4] ); + M5 = mm512_bswap_64( sc->buf[ 5] ); + M6 = mm512_bswap_64( sc->buf[ 6] ); + M7 = mm512_bswap_64( sc->buf[ 7] ); + M8 = mm512_bswap_64( sc->buf[ 8] ); + M9 = mm512_bswap_64( sc->buf[ 9] ); + MA = mm512_bswap_64( sc->buf[10] ); + MB = mm512_bswap_64( sc->buf[11] ); + MC = mm512_bswap_64( sc->buf[12] ); + MD = mm512_bswap_64( sc->buf[13] ); + ME = mm512_bswap_64( sc->buf[14] ); + MF = mm512_bswap_64( sc->buf[15] ); ROUND_B_8WAY(0); ROUND_B_8WAY(1); @@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc ) } // won't be used after prehash 
implemented -void blake512_8way_compress_le( blake_8x64_big_context *sc ) +void blake512_8x64_compress_le( blake_8x64_big_context *sc ) { __m512i M0, M1, M2, M3, M4, M5, M6, M7; __m512i M8, M9, MA, MB, MC, MD, ME, MF; @@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst, { if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) sc->T1 = sc->T1 + 1; - blake512_8way_compress( sc ); + blake512_8x64_compress( sc ); sc->ptr = 0; } @@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst, if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) sc->T1 = sc->T1 + 1; - blake512_8way_compress( sc ); + blake512_8x64_compress( sc ); mm512_block_bswap_64( (__m512i*)dst, sc->H ); } @@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst, { if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) sc->T1 = sc->T1 + 1; - blake512_8way_compress_le( sc ); + blake512_8x64_compress_le( sc ); sc->ptr = 0; } @@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst, if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) sc->T1 = sc->T1 + 1; - blake512_8way_compress_le( sc ); + blake512_8x64_compress_le( sc ); mm512_block_bswap_64( (__m512i*)dst, sc->H ); } @@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst) VD = v256_64( CB5 ^ T0 ); \ VE = v256_64( CB6 ^ T1 ); \ VF = v256_64( CB7 ^ T1 ); \ - const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \ - M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ - M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ - M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \ - M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \ - M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \ - M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \ - M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \ - M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \ - M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \ - M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \ - MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \ - MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \ - MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \ - MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \ - ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \ - MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \ + M0 = mm256_bswap_64( *(buf+ 0) ); \ + M1 = mm256_bswap_64( *(buf+ 1) ); \ + M2 = mm256_bswap_64( *(buf+ 2) ); \ + M3 = mm256_bswap_64( *(buf+ 3) ); \ + M4 = mm256_bswap_64( *(buf+ 4) ); \ + M5 = mm256_bswap_64( *(buf+ 5) ); \ + M6 = mm256_bswap_64( *(buf+ 6) ); \ + M7 = mm256_bswap_64( *(buf+ 7) ); \ + M8 = mm256_bswap_64( *(buf+ 8) ); \ + M9 = mm256_bswap_64( *(buf+ 9) ); \ + MA = mm256_bswap_64( *(buf+10) ); \ + MB = mm256_bswap_64( *(buf+11) ); \ + MC = mm256_bswap_64( *(buf+12) ); \ + MD = mm256_bswap_64( *(buf+13) ); \ + ME = mm256_bswap_64( *(buf+14) ); \ + MF = mm256_bswap_64( *(buf+15) ); \ ROUND_B_4WAY(0); \ ROUND_B_4WAY(1); \ ROUND_B_4WAY(2); \ @@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst) } -void blake512_4way_compress( blake_4x64_big_context *sc ) +void blake512_4x64_compress( blake_4x64_big_context *sc ) { __m256i M0, M1, M2, M3, M4, M5, M6, M7; __m256i M8, M9, MA, MB, MC, MD, ME, MF; @@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc ) VD = v256_64( CB5 ^ sc->T0 ); VE = v256_64( CB6 ^ sc->T1 ); VF = v256_64( CB7 ^ sc->T1 ); - const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( - 
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); - M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 ); - M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 ); - M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 ); - M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 ); - M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 ); - M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 ); - M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 ); - M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 ); - M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 ); - M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 ); - MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 ); - MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 ); - MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 ); - MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 ); - ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 ); - MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 ); + M0 = mm256_bswap_64( sc->buf[ 0] ); + M1 = mm256_bswap_64( sc->buf[ 1] ); + M2 = mm256_bswap_64( sc->buf[ 2] ); + M3 = mm256_bswap_64( sc->buf[ 3] ); + M4 = mm256_bswap_64( sc->buf[ 4] ); + M5 = mm256_bswap_64( sc->buf[ 5] ); + M6 = mm256_bswap_64( sc->buf[ 6] ); + M7 = mm256_bswap_64( sc->buf[ 7] ); + M8 = mm256_bswap_64( sc->buf[ 8] ); + M9 = mm256_bswap_64( sc->buf[ 9] ); + MA = mm256_bswap_64( sc->buf[10] ); + MB = mm256_bswap_64( sc->buf[11] ); + MC = mm256_bswap_64( sc->buf[12] ); + MD = mm256_bswap_64( sc->buf[13] ); + ME = mm256_bswap_64( sc->buf[14] ); + MF = mm256_bswap_64( sc->buf[15] ); ROUND_B_4WAY(0); ROUND_B_4WAY(1); @@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc ) sc->H[7] = mm256_xor3( VF, V7, sc->H[7] ); } -void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate, +void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate, const void *data ) { __m256i V0, V1, V2, V3, V4, V5, V6, V7; @@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate, midstate[15] = VF; } -void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash, +void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash, const __m256i nonce, const __m256i *midstate ) { __m256i M0, M1, M2, M3, M4, M5, M6, M7; @@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash, } -void blake512_4x64_init( blake_4x64_big_context *sc ) +void blake512_4x64_init( blake512_4x64_context *sc ) { casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 ); casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B ); @@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst ) } // init, update & close -void blake512_4x64_full( blake_4x64_big_context *sc, void * dst, +void blake512_4x64_full( blake512_4x64_context *sc, void * dst, const void *data, size_t len ) { @@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst, { if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) sc->T1 = sc->T1 + 1; - blake512_4way_compress( sc ); + blake512_4x64_compress( sc ); sc->ptr = 0; } @@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst, if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 ) sc->T1 = sc->T1 + 1; - blake512_4way_compress( sc ); + blake512_4x64_compress( sc ); mm256_block_bswap_64( (__m256i*)dst, sc->H ); } @@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc ) VE = v128_64( CB6 ^ sc->T1 ); VF = v128_64( CB7 ^ sc->T1 ); -#if defined(__SSSE3__) - - const 
v128u64_t shuf_bswap64 = v128_set64( - 0x08090a0b0c0d0e0f, 0x0001020304050607 ); - M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 ); - M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 ); - M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 ); - M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 ); - M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 ); - M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 ); - M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 ); - M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 ); - M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 ); - M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 ); - MA = v128_shuffle8( sc->buf[10], shuf_bswap64 ); - MB = v128_shuffle8( sc->buf[11], shuf_bswap64 ); - MC = v128_shuffle8( sc->buf[12], shuf_bswap64 ); - MD = v128_shuffle8( sc->buf[13], shuf_bswap64 ); - ME = v128_shuffle8( sc->buf[14], shuf_bswap64 ); - MF = v128_shuffle8( sc->buf[15], shuf_bswap64 ); - -#else // SSE2 & NEON - M0 = v128_bswap64( sc->buf[ 0] ); M1 = v128_bswap64( sc->buf[ 1] ); M2 = v128_bswap64( sc->buf[ 2] ); @@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc ) ME = v128_bswap64( sc->buf[14] ); MF = v128_bswap64( sc->buf[15] ); -#endif - ROUND_B_2X64(0); ROUND_B_2X64(1); ROUND_B_2X64(2); diff --git a/algo/blake/blakecoin-4way.c b/algo/blake/blakecoin-4way.c index 9d9befa..5af893e 100644 --- a/algo/blake/blakecoin-4way.c +++ b/algo/blake/blakecoin-4way.c @@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); // Partialy prehash second block without touching nonces in block_buf[3]. - blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); + blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf ); do { - blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash, + blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash, block_buf, rounds ); for ( int lane = 0; lane < 16; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce, block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); // Partialy prehash second block without touching nonces in block_buf[3]. 
- blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); + blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf ); do { - blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash, + blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash, block_buf, rounds ); for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce, #elif defined (BLAKECOIN_4WAY) -blake256r8_4way_context blakecoin_4w_ctx; +blake256r8_4x32_context blakecoin_4w_ctx; void blakecoin_4way_hash(void *state, const void *input) { uint32_t vhash[8*4] __attribute__ ((aligned (64))); - blake256r8_4way_context ctx; + blake256r8_4x32_context ctx; memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx ); - blake256r8_4way_update( &ctx, input + (64<<2), 16 ); - blake256r8_4way_close( &ctx, vhash ); + blake256r8_4x32_update( &ctx, input + (64<<2), 16 ); + blake256r8_4x32_close( &ctx, vhash ); dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 ); } @@ -178,8 +178,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce, HTarget = 0x7f; v128_bswap32_intrlv80_4x32( vdata, pdata ); - blake256r8_4way_init( &blakecoin_4w_ctx ); - blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 ); + blake256r8_4x32_init( &blakecoin_4w_ctx ); + blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 ); do { *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) ); diff --git a/algo/blake/pentablake-4way.c b/algo/blake/pentablake-4way.c index 1a09362..83be8d2 100644 --- a/algo/blake/pentablake-4way.c +++ b/algo/blake/pentablake-4way.c @@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); - blake512_4way_context ctx; + blake512_4x64_context ctx; + blake512_4x64_init( &ctx ); + blake512_4x64_update( &ctx, input, 80 ); + blake512_4x64_close( &ctx, vhash ); - blake512_4way_init( &ctx ); - blake512_4way_update( &ctx, input, 80 ); - blake512_4way_close( &ctx, vhash ); + blake512_4x64_init( &ctx ); + blake512_4x64_update( &ctx, vhash, 64 ); + blake512_4x64_close( &ctx, vhash ); - blake512_4way_init( &ctx ); - blake512_4way_update( &ctx, vhash, 64 ); - blake512_4way_close( &ctx, vhash ); + blake512_4x64_init( &ctx ); + blake512_4x64_update( &ctx, vhash, 64 ); + blake512_4x64_close( &ctx, vhash ); - blake512_4way_init( &ctx ); - blake512_4way_update( &ctx, vhash, 64 ); - blake512_4way_close( &ctx, vhash ); + blake512_4x64_init( &ctx ); + blake512_4x64_update( &ctx, vhash, 64 ); + blake512_4x64_close( &ctx, vhash ); - blake512_4way_init( &ctx ); - blake512_4way_update( &ctx, vhash, 64 ); - blake512_4way_close( &ctx, vhash ); - - blake512_4way_init( &ctx ); - blake512_4way_update( &ctx, vhash, 64 ); - blake512_4way_close( &ctx, vhash ); + blake512_4x64_init( &ctx ); + blake512_4x64_update( &ctx, vhash, 64 ); + blake512_4x64_close( &ctx, vhash ); memcpy( output, hash0, 32 ); memcpy( output+32, hash1, 32 ); diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c index acd2267..7be3a28 100644 --- a/algo/blake/sph-blake2s.c +++ b/algo/blake/sph-blake2s.c @@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] ) v[14] = S->f[0] ^ blake2s_IV[6]; v[15] = S->f[1] ^ blake2s_IV[7]; -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) v128_t *V = (v128_t*)v; 
@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] ) V[3] = v128_swap64( V[3] ); \ V[2] = v128_shufll32( V[2] ) - BLAKE2S_ROUND(0); - BLAKE2S_ROUND(1); - BLAKE2S_ROUND(2); - BLAKE2S_ROUND(3); - BLAKE2S_ROUND(4); - BLAKE2S_ROUND(5); - BLAKE2S_ROUND(6); - BLAKE2S_ROUND(7); - BLAKE2S_ROUND(8); - BLAKE2S_ROUND(9); - -#undef BLAKE2S_ROUND - #else #define G(r,i,a,b,c,d) \ @@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] ) b = SPH_ROTR32(b ^ c, 7); \ } while(0) -#define ROUND(r) \ +#define BLAKE2S_ROUND(r) \ do { \ G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ @@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] ) G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ } while(0) - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - #endif + BLAKE2S_ROUND(0); + BLAKE2S_ROUND(1); + BLAKE2S_ROUND(2); + BLAKE2S_ROUND(3); + BLAKE2S_ROUND(4); + BLAKE2S_ROUND(5); + BLAKE2S_ROUND(6); + BLAKE2S_ROUND(7); + BLAKE2S_ROUND(8); + BLAKE2S_ROUND(9); + + for( size_t i = 0; i < 8; ++i ) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; #undef G -#undef ROUND +#undef BLAKE2S_ROUND return 0; } diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index 4ab3ab0..c23432c 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -39,16 +39,14 @@ #include #include "simd-utils.h" -#define SPH_SIZE_bmw256 256 - -#define SPH_SIZE_bmw512 512 - // BMW-256 4 way 32 +#if defined(__SSE2__) || defined(__ARM_NEON) + typedef struct { - v128_t buf[64]; - v128_t H[16]; + v128u32_t buf[64]; + v128u32_t H[16]; size_t ptr; uint32_t bit_count; // assume bit_count fits in 32 bits } bmw_4way_small_context; @@ -58,13 +56,19 @@ typedef bmw_4way_small_context bmw256_4way_context; void bmw256_4way_init( bmw256_4way_context *ctx ); void bmw256_4way_update(void *cc, const void *data, size_t len); -#define bmw256_4way bmw256_4way_update void bmw256_4way_close(void *cc, void *dst); void bmw256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +#define bmw256_4x32_context bmw256_4way_context +#define bmw256_4x32_init bmw256_4way_init +#define bmw256_4x32_update bmw256_4way_update +#define bmw256_4x32_close bmw256_4way_close + +#endif + #if defined(__AVX2__) // BMW-256 8 way 32 @@ -85,6 +89,11 @@ void bmw256_8way_update( bmw256_8way_context *ctx, const void *data, #define bmw256_8way bmw256_8way_update void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ); +#define bmw256_8x32_context bmw256_8way_context +#define bmw256_8x32_init bmw256_8way_init +#define bmw256_8x32_update bmw256_8way_update +#define bmw256_8x32_close bmw256_8way_close + #endif #if defined(SIMD512) @@ -106,6 +115,11 @@ void bmw256_16way_update( bmw256_16way_context *ctx, const void *data, size_t len ); void bmw256_16way_close( bmw256_16way_context *ctx, void *dst ); +#define bmw256_16x32_context bmw256_16way_context +#define bmw256_16x32_init bmw256_16way_init +#define bmw256_16x32_update bmw256_16way_update +#define bmw256_16x32_close bmw256_16way_close + #endif // BMW-512 2 way 64 diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 6392028..9c8916a 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -45,7 +45,7 @@ extern "C"{ #define LPAR ( -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) // BMW-256 4 way 32 /* @@ -284,9 +284,9 @@ static const uint32_t IV256[] = { v128_xor( M[13], 
H[13] ) ) ) -void compress_small( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] ) +void compress_small( const v128u32_t *M, const v128u32_t H[16], v128u32_t dH[16] ) { - v128u64_t qt[32], xl, xh; \ + v128u32_t qt[32], xl, xh; \ qt[ 0] = v128_add32( ss0( Ws0 ), H[ 1] ); qt[ 1] = v128_add32( ss1( Ws1 ), H[ 2] ); @@ -428,49 +428,25 @@ static const uint32_t final_s[16][4] = { 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae }, { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf } }; -/* -static const v128u64_t final_s[16] = -{ - { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 }, - { 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 }, - { 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 }, - { 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 }, - { 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 }, - { 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 }, - { 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 }, - { 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 }, - { 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 }, - { 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 }, - { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa }, - { 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab }, - { 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac }, - { 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad }, - { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae }, - { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf } -}; -*/ + void bmw256_4way_init( bmw256_4way_context *ctx ) { - ctx->H[ 0] = v128_64( 0x4041424340414243 ); - ctx->H[ 1] = v128_64( 0x4445464744454647 ); - ctx->H[ 2] = v128_64( 0x48494A4B48494A4B ); - ctx->H[ 3] = v128_64( 0x4C4D4E4F4C4D4E4F ); - ctx->H[ 4] = v128_64( 0x5051525350515253 ); - ctx->H[ 5] = v128_64( 0x5455565754555657 ); - ctx->H[ 6] = v128_64( 0x58595A5B58595A5B ); - ctx->H[ 7] = v128_64( 0x5C5D5E5F5C5D5E5F ); - ctx->H[ 8] = v128_64( 0x6061626360616263 ); - ctx->H[ 9] = v128_64( 0x6465666764656667 ); - ctx->H[10] = v128_64( 0x68696A6B68696A6B ); - ctx->H[11] = v128_64( 0x6C6D6E6F6C6D6E6F ); - ctx->H[12] = v128_64( 0x7071727370717273 ); - ctx->H[13] = v128_64( 0x7475767774757677 ); - ctx->H[14] = v128_64( 0x78797A7B78797A7B ); - ctx->H[15] = v128_64( 0x7C7D7E7F7C7D7E7F ); - - -// for ( int i = 0; i < 16; i++ ) -// sc->H[i] = v128_32( iv[i] ); + ctx->H[ 0] = v128_32( 0x40414243 ); + ctx->H[ 1] = v128_32( 0x44454647 ); + ctx->H[ 2] = v128_32( 0x48494A4B ); + ctx->H[ 3] = v128_32( 0x4C4D4E4F ); + ctx->H[ 4] = v128_32( 0x50515253 ); + ctx->H[ 5] = v128_32( 0x54555657 ); + ctx->H[ 6] = v128_32( 0x58595A5B ); + ctx->H[ 7] = v128_32( 0x5C5D5E5F ); + ctx->H[ 8] = v128_32( 0x60616263 ); + ctx->H[ 9] = v128_32( 0x64656667 ); + ctx->H[10] = v128_32( 0x68696A6B ); + ctx->H[11] = v128_32( 0x6C6D6E6F ); + ctx->H[12] = v128_32( 0x70717273 ); + ctx->H[13] = v128_32( 0x74757677 ); + ctx->H[14] = v128_32( 0x78797A7B ); + ctx->H[15] = v128_32( 0x7C7D7E7F ); ctx->ptr = 0; ctx->bit_count = 0; } @@ -478,10 +454,10 @@ void bmw256_4way_init( bmw256_4way_context *ctx ) static void bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) { - v128u64_t *vdata = (v128u64_t*)data; - v128u64_t *buf; - v128u64_t htmp[16]; - v128u64_t *h1, *h2; + v128u32_t *vdata = (v128u32_t*)data; + v128u32_t *buf; + v128u32_t htmp[16]; + v128u32_t *h1, *h2; size_t ptr; const int buf_size = 64; // bytes of one lane, compatible with len @@ -503,7 +479,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) ptr += clen; if ( ptr == buf_size ) { - v128u64_t *ht; + v128u32_t *ht; compress_small( buf, h1, h2 ); ht = h1; h1 = h2; @@ -521,14 +497,14 @@ static void bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, void *dst, size_t 
out_size_w32) { - v128u64_t *buf; - v128u64_t h1[16], h2[16], *h; + v128u32_t *buf; + v128u32_t h1[16], h2[16], *h; size_t ptr, u, v; const int buf_size = 64; // bytes of one lane, compatible with len buf = sc->buf; ptr = sc->ptr; - buf[ ptr>>2 ] = v128_64( 0x0000008000000080 ); + buf[ ptr>>2 ] = v128_32( 0x00000080 ); ptr += 4; h = sc->H; @@ -548,7 +524,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, for ( u = 0; u < 16; u ++ ) buf[u] = h2[u]; - compress_small( buf, (v128u64_t*)final_s, h1 ); + compress_small( buf, (v128u32_t*)final_s, h1 ); for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) casti_v128( dst, u ) = h1[v]; diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 6fab0ef..a502475 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -39,7 +39,7 @@ static void transform( cubehashParam *sp ) #elif defined(__AVX2__) - register __m256i x0, x1, x2, x3, y0, y1; + register __m256i x0, x1, x2, x3, t0; x0 = _mm256_load_si256( (__m256i*)sp->x ); x1 = _mm256_load_si256( (__m256i*)sp->x + 1 ); @@ -50,10 +50,10 @@ static void transform( cubehashParam *sp ) { x2 = _mm256_add_epi32( x0, x2 ); x3 = _mm256_add_epi32( x1, x3 ); - y0 = mm256_rol_32( x1, 7 ); - y1 = mm256_rol_32( x0, 7 ); - x0 = _mm256_xor_si256( y0, x2 ); - x1 = _mm256_xor_si256( y1, x3 ); + t0 = mm256_rol_32( x1, 7 ); + x1 = mm256_rol_32( x0, 7 ); + x0 = _mm256_xor_si256( t0, x2 ); + x1 = _mm256_xor_si256( x1, x3 ); x2 = mm256_swap128_64( x2 ); x3 = mm256_swap128_64( x3 ); x2 = _mm256_add_epi32( x0, x2 ); @@ -75,7 +75,7 @@ static void transform( cubehashParam *sp ) #else // AVX, SSE2, NEON - v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3; + v128_t x0, x1, x2, x3, x4, x5, x6, x7, t0, t1; x0 = casti_v128( sp->x, 0 ); x1 = casti_v128( sp->x, 1 ); @@ -92,16 +92,12 @@ static void transform( cubehashParam *sp ) x5 = v128_add32( x1, x5 ); x6 = v128_add32( x2, x6 ); x7 = v128_add32( x3, x7 ); - y0 = x2; - y1 = x3; - y2 = x0; - y3 = x1; - x0 = v128_rol32( y0, 7 ); - x1 = v128_rol32( y1, 7 ); - x2 = v128_rol32( y2, 7 ); - x3 = v128_rol32( y3, 7 ); - x0 = v128_xor( x0, x4 ); - x1 = v128_xor( x1, x5 ); + t0 = v128_rol32( x2, 7 ); + t1 = v128_rol32( x3, 7 ); + x2 = v128_rol32( x0, 7 ); + x3 = v128_rol32( x1, 7 ); + x0 = v128_xor( t0, x4 ); + x1 = v128_xor( t1, x5 ); x2 = v128_xor( x2, x6 ); x3 = v128_xor( x3, x7 ); x4 = v128_swap64( x4 ); @@ -112,19 +108,15 @@ static void transform( cubehashParam *sp ) x5 = v128_add32( x1, x5 ); x6 = v128_add32( x2, x6 ); x7 = v128_add32( x3, x7 ); - y0 = x1; - y1 = x0; - y2 = x3; - y3 = x2; - x0 = v128_rol32( y0, 11 ); - x1 = v128_rol32( y1, 11 ); - x2 = v128_rol32( y2, 11 ); - x3 = v128_rol32( y3, 11 ); - x0 = v128_xor( x0, x4 ); - x1 = v128_xor( x1, x5 ); - x2 = v128_xor( x2, x6 ); - x3 = v128_xor( x3, x7 ); - x4 = v128_swap64_32( x4 ); + t0 = v128_rol32( x1, 11 ); + x1 = v128_rol32( x0, 11 ); + t1 = v128_rol32( x3, 11 ); + x3 = v128_rol32( x2, 11 ); + x0 = v128_xor( t0, x4 ); + x1 = v128_xor( x1, x5 ); + x2 = v128_xor( t1, x6 ); + x3 = v128_xor( x3, x7 ); + x4 = v128_swap64_32( x4 ); x5 = v128_swap64_32( x5 ); x6 = v128_swap64_32( x6 ); x7 = v128_swap64_32( x7 ); diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 1a374d3..8f9d588 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -17,7 +17,7 @@ typedef struct { #else hashState_groestl groestl; #endif - sha256_8way_context sha; + sha256_8x32_context sha; } myrgr_8way_ctx_holder; myrgr_8way_ctx_holder 
myrgr_8way_ctx; @@ -29,7 +29,7 @@ void init_myrgr_8way_ctx() #else init_groestl( &myrgr_8way_ctx.groestl, 64 ); #endif - sha256_8way_init( &myrgr_8way_ctx.sha ); + sha256_8x32_init( &myrgr_8way_ctx.sha ); } void myriad_8way_hash( void *output, const void *input ) @@ -96,8 +96,8 @@ void myriad_8way_hash( void *output, const void *input ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - sha256_8way_update( &ctx.sha, vhash, 64 ); - sha256_8way_close( &ctx.sha, output ); + sha256_8x32_update( &ctx.sha, vhash, 64 ); + sha256_8x32_close( &ctx.sha, output ); } int scanhash_myriad_8way( struct work *work, uint32_t max_nonce, @@ -156,7 +156,7 @@ int scanhash_myriad_8way( struct work *work, uint32_t max_nonce, typedef struct { hashState_groestl groestl; - sha256_4way_context sha; + sha256_4x32_context sha; } myrgr_4way_ctx_holder; myrgr_4way_ctx_holder myrgr_4way_ctx; @@ -164,7 +164,7 @@ myrgr_4way_ctx_holder myrgr_4way_ctx; void init_myrgr_4way_ctx() { init_groestl (&myrgr_4way_ctx.groestl, 64 ); - sha256_4way_init( &myrgr_4way_ctx.sha ); + sha256_4x32_init( &myrgr_4way_ctx.sha ); } void myriad_4way_hash( void *output, const void *input ) @@ -189,8 +189,8 @@ void myriad_4way_hash( void *output, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - sha256_4way_update( &ctx.sha, vhash, 64 ); - sha256_4way_close( &ctx.sha, output ); + sha256_4x32_update( &ctx.sha, vhash, 64 ); + sha256_4x32_close( &ctx.sha, output ); } int scanhash_myriad_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index a3c5bfe..87c4bfa 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -1059,7 +1059,7 @@ void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num ) WRITE_STATE_BIG8( sc ); } -void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) +void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf ) { __m512i m0, m1, m2, m3, m4, m5, m6, m7; @@ -1071,7 +1071,7 @@ void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf ) WRITE_STATE_BIG8( sc ); } -void hamsi512_8way_init( hamsi_8way_big_context *sc ) +void hamsi512_8x64_init( hamsi512_8x64_context *sc ) { sc->partial_len = 0; sc->count_high = sc->count_low = 0; @@ -1087,7 +1087,7 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc ) sc->h[7] = v512_64( iv[7] ); } -void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data, +void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data, size_t len ) { __m512i *vdata = (__m512i*)data; @@ -1099,7 +1099,7 @@ void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data, sc->partial_len = len; } -void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst ) +void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst ) { __m512i pad[1]; uint32_t ch, cl; @@ -1944,7 +1944,7 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void * dst, //////////// -void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) +void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num ) { DECL_STATE_BIG uint32_t tmp; @@ -1968,7 +1968,7 @@ void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) WRITE_STATE_BIG( sc ); } -void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf ) +void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf ) { __m256i m0, m1, m2, m3, m4, m5, m6, m7; DECL_STATE_BIG @@ -1979,7 +1979,7 @@ void hamsi_big_final( 
hamsi_4way_big_context *sc, __m256i *buf ) WRITE_STATE_BIG( sc ); } -void hamsi512_4way_init( hamsi_4way_big_context *sc ) +void hamsi512_4x64_init( hamsi512_4x64_context *sc ) { sc->partial_len = 0; sc->count_high = sc->count_low = 0; @@ -1994,7 +1994,7 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc ) sc->h[7] = v256_64( iv[7] ); } -void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data, +void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; @@ -2006,7 +2006,7 @@ void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data, sc->partial_len = len; } -void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst ) +void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst ) { __m256i pad[1]; uint32_t ch, cl; diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 15d964e..0b32c6f 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -72,17 +72,17 @@ typedef struct size_t partial_len; uint32_t count_high, count_low; } hamsi_4way_big_context; -typedef hamsi_4way_big_context hamsi512_4way_context; +typedef hamsi_4way_big_context hamsi512_4x64_context; -void hamsi512_4way_init( hamsi512_4way_context *sc ); -void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data, +void hamsi512_4x64_init( hamsi512_4x64_context *sc ); +void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data, size_t len ); -void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); +void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst ); -#define hamsi512_4x64_context hamsi512_4way_context -#define hamsi512_4x64_init hamsi512_4way_init -#define hamsi512_4x64_update hamsi512_4way_update -#define hamsi512_4x64_close hamsi512_4way_close +#define hamsi512_4way_context hamsi512_4x64_context +#define hamsi512_4way_init hamsi512_4x64_init +#define hamsi512_4way_update hamsi512_4x64_update +#define hamsi512_4way_close hamsi512_4x64_close // Hamsi-512 8x32 @@ -115,17 +115,17 @@ typedef struct size_t partial_len; uint32_t count_high, count_low; } hamsi_8way_big_context; -typedef hamsi_8way_big_context hamsi512_8way_context; +typedef hamsi_8way_big_context hamsi512_8x64_context; -void hamsi512_8way_init( hamsi512_8way_context *sc ); -void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data, +void hamsi512_8x64_init( hamsi512_8x64_context *sc ); +void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data, size_t len ); -void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst ); +void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst ); -#define hamsi512_8x64_context hamsi512_8way_context -#define hamsi512_8x64_init hamsi512_8way_init -#define hamsi512_8x64_update hamsi512_8way_update -#define hamsi512_8x64_close hamsi512_8way_close +#define hamsi512_8way_context hamsi512_8x64_context +#define hamsi512_8way_init hamsi512_8x64_init +#define hamsi512_8way_update hamsi512_8x64_update +#define hamsi512_8way_close hamsi512_8x64_close // Hamsi-512 16x32 diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 0006d96..d563b64 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -82,12 +82,15 @@ typedef struct { typedef haval_4way_context haval256_5_4way_context; void haval256_5_4way_init( void *cc ); - void haval256_5_4way_update( void *cc, const void *data, size_t len ); //#define haval256_5_4way haval256_5_4way_update - void haval256_5_4way_close( void *cc, 
void *dst ); +#define haval256_4x32_context haval256_5_4way_context +#define haval256_4x32_init haval256_5_4way_init +#define haval256_4x32_update haval256_5_4way_update +#define haval256_4x32_close haval256_5_4way_close + #if defined(__AVX2__) typedef struct { @@ -100,11 +103,14 @@ typedef struct { typedef haval_8way_context haval256_5_8way_context; void haval256_5_8way_init( void *cc ); - void haval256_5_8way_update( void *cc, const void *data, size_t len ); - void haval256_5_8way_close( void *cc, void *dst ); +#define haval256_8x32_context haval256_5_8way_context +#define haval256_8x32_init haval256_5_8way_init +#define haval256_8x32_update haval256_5_8way_update +#define haval256_8x32_close haval256_5_8way_close + #endif // AVX2 #if defined(SIMD512) @@ -119,11 +125,14 @@ typedef struct { typedef haval_16way_context haval256_5_16way_context; void haval256_5_16way_init( void *cc ); - void haval256_5_16way_update( void *cc, const void *data, size_t len ); - void haval256_5_16way_close( void *cc, void *dst ); +#define haval256_16x32_context haval256_5_16way_context +#define haval256_16x32_init haval256_5_16way_init +#define haval256_16x32_update haval256_5_16way_update +#define haval256_16x32_close haval256_5_16way_close + #endif // AVX512 #ifdef __cplusplus diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index d23440b..e3fc996 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -190,7 +190,7 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst, memcpy_512( dst, kc->w, m512_len ); } -void keccak256_8way_init( void *kc ) +void keccak256_8x64_init( void *kc ) { keccak64_8way_init( kc, 256 ); } diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c index ba885b3..0e09042 100644 --- a/algo/keccak/sha3d-4way.c +++ b/algo/keccak/sha3d-4way.c @@ -9,7 +9,7 @@ void sha3d_hash_8way(void *state, const void *input) { uint32_t buffer[16*8] __attribute__ ((aligned (128))); - keccak256_8way_context ctx; + keccak256_8x64_context ctx; keccak256_8x64_init( &ctx ); keccak256_8x64_update( &ctx, input, 80 ); @@ -69,7 +69,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, void sha3d_hash_4way(void *state, const void *input) { uint32_t buffer[16*4] __attribute__ ((aligned (64))); - keccak256_4way_context ctx; + keccak256_4x64_context ctx; keccak256_4x64_init( &ctx ); keccak256_4x64_update( &ctx, input, 80 ); diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index 54df488..ec35ddf 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -273,8 +273,6 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b ) uint32_t hash[8*4] __attribute((aligned(128))); __m512i* chainv = state->chainv; __m512i t[2]; - const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); /*---- blank round with m=0 ----*/ rnd512_4way( state, NULL ); @@ -289,10 +287,8 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b ) _mm512_store_si512( (__m512i*)&hash[ 0], t[0] ); _mm512_store_si512( (__m512i*)&hash[16], t[1] ); - casti_m512i( b,0 ) = _mm512_shuffle_epi8( - casti_m512i( hash,0 ), shuff_bswap32 ); - casti_m512i( b,1 ) = _mm512_shuffle_epi8( - casti_m512i( hash,1 ), shuff_bswap32 ); + casti_m512i( b,0 ) = mm512_bswap_32( casti_m512i( hash,0 ) ); + casti_m512i( b,1 ) = mm512_bswap_32( casti_m512i( hash,1 ) ); rnd512_4way( state, NULL ); @@ -306,10 +302,8 @@ void finalization512_4way( luffa_4way_context *state, 
uint32 *b ) _mm512_store_si512( (__m512i*)&hash[ 0], t[0] ); _mm512_store_si512( (__m512i*)&hash[16], t[1] ); - casti_m512i( b,2 ) = _mm512_shuffle_epi8( - casti_m512i( hash,0 ), shuff_bswap32 ); - casti_m512i( b,3 ) = _mm512_shuffle_epi8( - casti_m512i( hash,1 ), shuff_bswap32 ); + casti_m512i( b,2 ) = mm512_bswap_32( casti_m512i( hash,0 ) ); + casti_m512i( b,3 ) = mm512_bswap_32( casti_m512i( hash,1 ) ); } int luffa_4way_init( luffa_4way_context *state, int hashbitlen ) @@ -349,16 +343,14 @@ int luffa_4way_update( luffa_4way_context *state, const void *data, __m512i msg[2]; int i; int blocks = (int)len >> 5; - const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); state->rembytes = (int)len & 0x1F; // full blocks for ( i = 0; i < blocks; i++, vdata+=2 ) { - msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 ); + msg[0] = mm512_bswap_32( vdata[ 0 ] ); + msg[1] = mm512_bswap_32( vdata[ 1 ] ); rnd512_4way( state, msg ); } @@ -367,7 +359,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data, if ( state->rembytes ) { // remaining data bytes - buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 ); + buffer[0] = mm512_bswap_32( vdata[0] ); buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 ); } return 0; @@ -434,16 +426,14 @@ int luffa512_4way_full( luffa_4way_context *state, void *output, __m512i msg[2]; int i; const int blocks = (int)( inlen >> 5 ); - const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); state->rembytes = inlen & 0x1F; // full blocks for ( i = 0; i < blocks; i++, vdata+=2 ) { - msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 ); + msg[0] = mm512_bswap_32( vdata[ 0 ] ); + msg[1] = mm512_bswap_32( vdata[ 1 ] ); rnd512_4way( state, msg ); } @@ -451,7 +441,7 @@ int luffa512_4way_full( luffa_4way_context *state, void *output, if ( state->rembytes ) { // padding of partial block - msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); + msg[0] = mm512_bswap_32( vdata[ 0 ] ); msg[1] = mm512_bcast128lo_64( 0x0000000080000000 ); rnd512_4way( state, msg ); } @@ -479,16 +469,14 @@ int luffa_4way_update_close( luffa_4way_context *state, __m512i msg[2]; int i; const int blocks = (int)( inlen >> 5 ); - const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); state->rembytes = inlen & 0x1F; // full blocks for ( i = 0; i < blocks; i++, vdata+=2 ) { - msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 ); + msg[0] = mm512_bswap_32( vdata[ 0 ] ); + msg[1] = mm512_bswap_32( vdata[ 1 ] ); rnd512_4way( state, msg ); } @@ -496,7 +484,7 @@ int luffa_4way_update_close( luffa_4way_context *state, if ( state->rembytes ) { // padding of partial block - msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); + msg[0] = mm512_bswap_32( vdata[ 0 ] ); msg[1] = mm512_bcast128lo_64( 0x0000000080000000 ); rnd512_4way( state, msg ); } @@ -775,8 +763,6 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b ) uint32 hash[8*2] __attribute((aligned(64))); __m256i* chainv = state->chainv; __m256i t0, t1; - const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); /*---- blank round with m=0 ----*/ rnd512_2way( state, NULL ); @@ -791,10 +777,8 @@ void finalization512_2way( 
luffa_2way_context *state, uint32 *b ) _mm256_store_si256( (__m256i*)&hash[0], t0 ); _mm256_store_si256( (__m256i*)&hash[8], t1 ); - casti_m256i( b, 0 ) = _mm256_shuffle_epi8( - casti_m256i( hash, 0 ), shuff_bswap32 ); - casti_m256i( b, 1 ) = _mm256_shuffle_epi8( - casti_m256i( hash, 1 ), shuff_bswap32 ); + casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); + casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); rnd512_2way( state, NULL ); @@ -809,10 +793,8 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b ) _mm256_store_si256( (__m256i*)&hash[0], t0 ); _mm256_store_si256( (__m256i*)&hash[8], t1 ); - casti_m256i( b, 2 ) = _mm256_shuffle_epi8( - casti_m256i( hash, 0 ), shuff_bswap32 ); - casti_m256i( b, 3 ) = _mm256_shuffle_epi8( - casti_m256i( hash, 1 ), shuff_bswap32 ); + casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); + casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); } int luffa_2way_init( luffa_2way_context *state, int hashbitlen ) @@ -847,15 +829,13 @@ int luffa_2way_update( luffa_2way_context *state, const void *data, __m256i msg[2]; int i; int blocks = (int)len >> 5; - const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); state-> rembytes = (int)len & 0x1F; // full blocks for ( i = 0; i < blocks; i++, vdata+=2 ) { - msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 ); + msg[0] = mm256_bswap_32( vdata[ 0 ] ); + msg[1] = mm256_bswap_32( vdata[ 1 ] ); rnd512_2way( state, msg ); } @@ -864,7 +844,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data, if ( state->rembytes ) { // remaining data bytes - buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 ); + buffer[0] = mm256_bswap_32( vdata[0] ); buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 ); } return 0; @@ -916,16 +896,14 @@ int luffa512_2way_full( luffa_2way_context *state, void *output, __m256i msg[2]; int i; const int blocks = (int)( inlen >> 5 ); - const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); state->rembytes = inlen & 0x1F; // full blocks for ( i = 0; i < blocks; i++, vdata+=2 ) { - msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 ); + msg[0] = mm256_bswap_32( vdata[ 0 ] ); + msg[1] = mm256_bswap_32( vdata[ 1 ] ); rnd512_2way( state, msg ); } @@ -933,7 +911,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output, if ( state->rembytes ) { // padding of partial block - msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); + msg[0] = mm256_bswap_32( vdata[ 0 ] ); msg[1] = mm256_bcast128lo_64( 0x0000000080000000 ); rnd512_2way( state, msg ); } @@ -961,16 +939,14 @@ int luffa_2way_update_close( luffa_2way_context *state, __m256i msg[2]; int i; const int blocks = (int)( inlen >> 5 ); - const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); state->rembytes = inlen & 0x1F; // full blocks for ( i = 0; i < blocks; i++, vdata+=2 ) { - msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); - msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 ); + msg[0] = mm256_bswap_32( vdata[ 0 ] ); + msg[1] = mm256_bswap_32( vdata[ 1 ] ); rnd512_2way( state, msg ); } @@ -978,7 +954,7 @@ int luffa_2way_update_close( luffa_2way_context *state, if ( state->rembytes ) { // padding of partial block - msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 ); + msg[0] = mm256_bswap_32( vdata[ 0 
] ); msg[1] = mm256_bcast128lo_64( 0x0000000080000000 ); rnd512_2way( state, msg ); } diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index 453ab12..c2d790b 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -26,9 +26,9 @@ #if defined (ALLIUM_16WAY) typedef union { - keccak256_8way_context keccak; + keccak256_8x64_context keccak; cube_4way_2buf_context cube; - skein256_8way_context skein; + skein256_8x64_context skein; #if defined(__VAES__) groestl256_4way_context groestl; #else @@ -60,7 +60,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars, uint32_t hash15[8] __attribute__ ((aligned (32))); allium_16way_ctx_holder ctx __attribute__ ((aligned (64))); - blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 ); + blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 ); dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, @@ -70,12 +70,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars, intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, 256 ); - keccak256_8way_init( &ctx.keccak ); - keccak256_8way_update( &ctx.keccak, vhashA, 32 ); - keccak256_8way_close( &ctx.keccak, vhashA); - keccak256_8way_init( &ctx.keccak ); - keccak256_8way_update( &ctx.keccak, vhashB, 32 ); - keccak256_8way_close( &ctx.keccak, vhashB); + keccak256_8x64_init( &ctx.keccak ); + keccak256_8x64_update( &ctx.keccak, vhashA, 32 ); + keccak256_8x64_close( &ctx.keccak, vhashA); + keccak256_8x64_init( &ctx.keccak ); + keccak256_8x64_update( &ctx.keccak, vhashB, 32 ); + keccak256_8x64_close( &ctx.keccak, vhashB); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhashA, 256 ); @@ -153,12 +153,12 @@ static void allium_16way_hash( void *state, const void *midstate_vars, intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, 256 ); - skein256_8way_init( &ctx.skein ); - skein256_8way_update( &ctx.skein, vhashA, 32 ); - skein256_8way_close( &ctx.skein, vhashA ); - skein256_8way_init( &ctx.skein ); - skein256_8way_update( &ctx.skein, vhashB, 32 ); - skein256_8way_close( &ctx.skein, vhashB ); + skein256_8x64_init( &ctx.skein ); + skein256_8x64_update( &ctx.skein, vhashA, 32 ); + skein256_8x64_close( &ctx.skein, vhashA ); + skein256_8x64_init( &ctx.skein ); + skein256_8x64_update( &ctx.skein, vhashB, 32 ); + skein256_8x64_close( &ctx.skein, vhashB ); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhashA, 256 ); @@ -251,7 +251,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ); // Partialy prehash second block without touching nonces in block_buf[3]. 
- blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); + blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf ); do { allium_16way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -273,9 +273,9 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce, #elif defined (ALLIUM_8WAY) typedef union { - keccak256_4way_context keccak; + keccak256_4x64_context keccak; cube_2way_context cube; - skein256_4way_context skein; + skein256_4x64_context skein; #if defined(__VAES__) groestl256_2way_context groestl; #else @@ -298,19 +298,19 @@ static void allium_8way_hash( void *hash, const void *midstate_vars, uint64_t *hash7 = (uint64_t*)hash+28; allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); - blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 ); + blake256_8x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhashA, 256 ); intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 ); - keccak256_4way_init( &ctx.keccak ); - keccak256_4way_update( &ctx.keccak, vhashA, 32 ); - keccak256_4way_close( &ctx.keccak, vhashA ); - keccak256_4way_init( &ctx.keccak ); - keccak256_4way_update( &ctx.keccak, vhashB, 32 ); - keccak256_4way_close( &ctx.keccak, vhashB ); + keccak256_4x64_init( &ctx.keccak ); + keccak256_4x64_update( &ctx.keccak, vhashA, 32 ); + keccak256_4x64_close( &ctx.keccak, vhashA ); + keccak256_4x64_init( &ctx.keccak ); + keccak256_4x64_update( &ctx.keccak, vhashB, 32 ); + keccak256_4x64_close( &ctx.keccak, vhashB ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 ); @@ -350,12 +350,12 @@ static void allium_8way_hash( void *hash, const void *midstate_vars, intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 ); - skein256_4way_init( &ctx.skein ); - skein256_4way_update( &ctx.skein, vhashA, 32 ); - skein256_4way_close( &ctx.skein, vhashA ); - skein256_4way_init( &ctx.skein ); - skein256_4way_update( &ctx.skein, vhashB, 32 ); - skein256_4way_close( &ctx.skein, vhashB ); + skein256_4x64_init( &ctx.skein ); + skein256_4x64_update( &ctx.skein, vhashA, 32 ); + skein256_4x64_close( &ctx.skein, vhashA ); + skein256_4x64_init( &ctx.skein ); + skein256_4x64_update( &ctx.skein, vhashB, 32 ); + skein256_4x64_close( &ctx.skein, vhashB ); #if defined(__VAES__) @@ -433,7 +433,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce, n+ 3, n+ 2, n+ 1, n ); // Partialy prehash second block without touching nonces - blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); + blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf ); do { allium_8way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -483,7 +483,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars, uint64_t *hash3 = (uint64_t*)hash+12; allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); - blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 ); + blake256_4x32_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 ); intrlv_2x64( vhashA, hash0, hash1, 256 ); @@ -588,7 +588,7 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce, block_buf[15] = v128_32( 640 ); // Partialy prehash second block without touching nonces - 
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); + blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf ); do { allium_4way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -616,7 +616,6 @@ int scanhash_allium_4way( struct work *work, uint32_t max_nonce, // // 1 way - typedef struct { blake256_context blake; diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c index 453177c..fae39c1 100644 --- a/algo/lyra2/lyra2h-4way.c +++ b/algo/lyra2/lyra2h-4way.c @@ -14,12 +14,12 @@ bool lyra2h_4way_thread_init() return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) ); } -static __thread blake256_4way_context l2h_4way_blake_mid; +static __thread blake256_4x32_context l2h_4way_blake_mid; void lyra2h_4way_midstate( const void* input ) { - blake256_4way_init( &l2h_4way_blake_mid ); - blake256_4way_update( &l2h_4way_blake_mid, input, 64 ); + blake256_4x32_init( &l2h_4way_blake_mid ); + blake256_4x32_update( &l2h_4way_blake_mid, input, 64 ); } void lyra2h_4way_hash( void *state, const void *input ) @@ -29,11 +29,11 @@ void lyra2h_4way_hash( void *state, const void *input ) uint32_t hash2[8] __attribute__ ((aligned (64))); uint32_t hash3[8] __attribute__ ((aligned (64))); uint32_t vhash[8*4] __attribute__ ((aligned (64))); - blake256_4way_context ctx_blake __attribute__ ((aligned (64))); + blake256_4x32_context ctx_blake __attribute__ ((aligned (64))); memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid ); - blake256_4way_update( &ctx_blake, input + (64*4), 16 ); - blake256_4way_close( &ctx_blake, vhash ); + blake256_4x32_update( &ctx_blake, input + (64*4), 16 ); + blake256_4x32_close( &ctx_blake, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index 32ef639..0cb0303 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -7,25 +7,24 @@ #include "algo/cubehash/cubehash_sse2.h" #include "algo/cubehash/cube-hash-2way.h" - #if defined (LYRA2REV2_16WAY) typedef struct { - blake256_16way_context blake; - keccak256_8way_context keccak; + blake256_16x32_context blake; + keccak256_8x64_context keccak; cubehashParam cube; - skein256_8way_context skein; - bmw256_16way_context bmw; + skein256_8x64_context skein; + bmw256_16x32_context bmw; } lyra2v2_16way_ctx_holder __attribute__ ((aligned (64))); static lyra2v2_16way_ctx_holder l2v2_16way_ctx; bool init_lyra2rev2_16way_ctx() { - keccak256_8way_init( &l2v2_16way_ctx.keccak ); + keccak256_8x64_init( &l2v2_16way_ctx.keccak ); cubehashInit( &l2v2_16way_ctx.cube, 256, 16, 32 ); - skein256_8way_init( &l2v2_16way_ctx.skein ); - bmw256_16way_init( &l2v2_16way_ctx.bmw ); + skein256_8x64_init( &l2v2_16way_ctx.skein ); + bmw256_16x32_init( &l2v2_16way_ctx.bmw ); return true; } @@ -51,8 +50,8 @@ void lyra2rev2_16way_hash( void *state, const void *input ) lyra2v2_16way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v2_16way_ctx, sizeof(l2v2_16way_ctx) ); - blake256_16way_update( &ctx.blake, input + (64<<4), 16 ); - blake256_16way_close( &ctx.blake, vhash ); + blake256_16x32_update( &ctx.blake, input + (64<<4), 16 ); + blake256_16x32_close( &ctx.blake, vhash ); dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, @@ -62,17 +61,17 @@ void lyra2rev2_16way_hash( void *state, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); - keccak256_8way_update( &ctx.keccak, vhash, 32 ); - keccak256_8way_close( 
&ctx.keccak, vhash ); + keccak256_8x64_update( &ctx.keccak, vhash, 32 ); + keccak256_8x64_close( &ctx.keccak, vhash ); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 256 ); intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, 256 ); - keccak256_8way_init( &ctx.keccak ); - keccak256_8way_update( &ctx.keccak, vhash, 32 ); - keccak256_8way_close( &ctx.keccak, vhash ); + keccak256_8x64_init( &ctx.keccak ); + keccak256_8x64_update( &ctx.keccak, vhash, 32 ); + keccak256_8x64_close( &ctx.keccak, vhash ); dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, vhash, 256 ); @@ -122,21 +121,20 @@ void lyra2rev2_16way_hash( void *state, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); - skein256_8way_update( &ctx.skein, vhash, 32 ); - skein256_8way_close( &ctx.skein, vhash ); + skein256_8x64_update( &ctx.skein, vhash, 32 ); + skein256_8x64_close( &ctx.skein, vhash ); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 256 ); intrlv_8x64( vhash, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, 256 ); - skein256_8way_init( &ctx.skein ); - skein256_8way_update( &ctx.skein, vhash, 32 ); - skein256_8way_close( &ctx.skein, vhash ); + skein256_8x64_init( &ctx.skein ); + skein256_8x64_update( &ctx.skein, vhash, 32 ); + skein256_8x64_close( &ctx.skein, vhash ); dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, vhash, 256 ); - cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 ); cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 ); @@ -160,8 +158,8 @@ void lyra2rev2_16way_hash( void *state, const void *input ) hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, 256 ); - bmw256_16way_update( &ctx.bmw, vhash, 32 ); - bmw256_16way_close( &ctx.bmw, state ); + bmw256_16x32_update( &ctx.bmw, vhash, 32 ); + bmw256_16x32_close( &ctx.bmw, state ); } int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce, @@ -186,8 +184,8 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce, mm512_bswap32_intrlv80_16x32( vdata, pdata ); *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ); - blake256_16way_init( &l2v2_16way_ctx.blake ); - blake256_16way_update( &l2v2_16way_ctx.blake, vdata, 64 ); + blake256_16x32_init( &l2v2_16way_ctx.blake ); + blake256_16x32_update( &l2v2_16way_ctx.blake, vdata, 64 ); do { @@ -214,21 +212,21 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce, #elif defined (LYRA2REV2_8WAY) typedef struct { - blake256_8way_context blake; - keccak256_4way_context keccak; + blake256_8x32_context blake; + keccak256_4x64_context keccak; cubehashParam cube; - skein256_4way_context skein; - bmw256_8way_context bmw; + skein256_4x64_context skein; + bmw256_8x32_context bmw; } lyra2v2_8way_ctx_holder __attribute__ ((aligned (64))); static lyra2v2_8way_ctx_holder l2v2_8way_ctx; bool init_lyra2rev2_8way_ctx() { - keccak256_4way_init( &l2v2_8way_ctx.keccak ); + keccak256_4x64_init( &l2v2_8way_ctx.keccak ); cubehashInit( &l2v2_8way_ctx.cube, 256, 16, 32 ); - skein256_4way_init( &l2v2_8way_ctx.skein ); - bmw256_8way_init( &l2v2_8way_ctx.bmw ); + skein256_4x64_init( &l2v2_8way_ctx.skein ); + bmw256_8x32_init( &l2v2_8way_ctx.bmw ); return true; } @@ -246,20 +244,20 @@ void lyra2rev2_8way_hash( void *state, const void *input ) lyra2v2_8way_ctx_holder ctx 
__attribute__ ((aligned (64))); memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) ); - blake256_8way_update( &ctx.blake, input + (64<<3), 16 ); - blake256_8way_close( &ctx.blake, vhash ); + blake256_8x32_update( &ctx.blake, input + (64<<3), 16 ); + blake256_8x32_close( &ctx.blake, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 256 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 ); - keccak256_4way_update( &ctx.keccak, vhash, 32 ); - keccak256_4way_close( &ctx.keccak, vhash ); + keccak256_4x64_update( &ctx.keccak, vhash, 32 ); + keccak256_4x64_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 ); intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 ); - keccak256_4way_init( &ctx.keccak ); - keccak256_4way_update( &ctx.keccak, vhash, 32 ); - keccak256_4way_close( &ctx.keccak, vhash ); + keccak256_4x64_init( &ctx.keccak ); + keccak256_4x64_update( &ctx.keccak, vhash, 32 ); + keccak256_4x64_close( &ctx.keccak, vhash ); dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 ); cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 ); @@ -282,13 +280,13 @@ void lyra2rev2_8way_hash( void *state, const void *input ) LYRA2REV2( l2v2_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 256 ); - skein256_4way_update( &ctx.skein, vhash, 32 ); - skein256_4way_close( &ctx.skein, vhash ); + skein256_4x64_update( &ctx.skein, vhash, 32 ); + skein256_4x64_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 256 ); intrlv_4x64( vhash, hash4, hash5, hash6, hash7, 256 ); - skein256_4way_init( &ctx.skein ); - skein256_4way_update( &ctx.skein, vhash, 32 ); - skein256_4way_close( &ctx.skein, vhash ); + skein256_4x64_init( &ctx.skein ); + skein256_4x64_update( &ctx.skein, vhash, 32 ); + skein256_4x64_close( &ctx.skein, vhash ); dintrlv_4x64( hash4, hash5, hash6, hash7, vhash, 256 ); cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 ); @@ -303,8 +301,8 @@ void lyra2rev2_8way_hash( void *state, const void *input ) intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); - bmw256_8way_update( &ctx.bmw, vhash, 32 ); - bmw256_8way_close( &ctx.bmw, state ); + bmw256_8x32_update( &ctx.bmw, vhash, 32 ); + bmw256_8x32_close( &ctx.bmw, state ); } int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce, @@ -328,8 +326,8 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce, mm256_bswap32_intrlv80_8x32( vdata, pdata ); *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); - blake256_8way_init( &l2v2_8way_ctx.blake ); - blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 ); + blake256_8x32_init( &l2v2_8way_ctx.blake ); + blake256_8x32_update( &l2v2_8way_ctx.blake, vdata, 64 ); do { @@ -356,21 +354,21 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce, #elif defined (LYRA2REV2_4WAY) typedef struct { - blake256_4way_context blake; - keccak256_4way_context keccak; + blake256_4x32_context blake; + keccak256_4x64_context keccak; cubehashParam cube; - skein256_4way_context skein; - bmw256_4way_context bmw; + skein256_4x64_context skein; + bmw256_4x32_context bmw; } lyra2v2_4way_ctx_holder; static lyra2v2_4way_ctx_holder l2v2_4way_ctx; bool init_lyra2rev2_4way_ctx() { - keccak256_4way_init( &l2v2_4way_ctx.keccak ); + keccak256_4x64_init( &l2v2_4way_ctx.keccak ); cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 ); - skein256_4way_init( 
&l2v2_4way_ctx.skein ); - bmw256_4way_init( &l2v2_4way_ctx.bmw ); + skein256_4x64_init( &l2v2_4way_ctx.skein ); + bmw256_4x32_init( &l2v2_4way_ctx.bmw ); return true; } @@ -385,13 +383,13 @@ void lyra2rev2_4way_hash( void *state, const void *input ) lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) ); - blake256_4way_update( &ctx.blake, input + (64<<2), 16 ); - blake256_4way_close( &ctx.blake, vhash ); + blake256_4x32_update( &ctx.blake, input + (64<<2), 16 ); + blake256_4x32_close( &ctx.blake, vhash ); rintrlv_4x32_4x64( vhash64, vhash, 256 ); - keccak256_4way_update( &ctx.keccak, vhash64, 32 ); - keccak256_4way_close( &ctx.keccak, vhash64 ); + keccak256_4x64_update( &ctx.keccak, vhash64, 32 ); + keccak256_4x64_close( &ctx.keccak, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); @@ -410,8 +408,8 @@ void lyra2rev2_4way_hash( void *state, const void *input ) intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); - skein256_4way_update( &ctx.skein, vhash64, 32 ); - skein256_4way_close( &ctx.skein, vhash64 ); + skein256_4x64_update( &ctx.skein, vhash64, 32 ); + skein256_4x64_close( &ctx.skein, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); @@ -426,8 +424,8 @@ void lyra2rev2_4way_hash( void *state, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); - bmw256_4way_update( &ctx.bmw, vhash, 32 ); - bmw256_4way_close( &ctx.bmw, state ); + bmw256_4x32_update( &ctx.bmw, vhash, 32 ); + bmw256_4x32_close( &ctx.bmw, state ); } int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, @@ -451,8 +449,8 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, v128_bswap32_intrlv80_4x32( vdata, pdata ); - blake256_4way_init( &l2v2_4way_ctx.blake ); - blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 ); + blake256_4x32_init( &l2v2_4way_ctx.blake ); + blake256_4x32_update( &l2v2_4way_ctx.blake, vdata, 64 ); do { diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 6443697..8027f1a 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -9,18 +9,18 @@ #if defined (LYRA2REV3_16WAY) typedef struct { - blake256_16way_context blake; + blake256_16x32_context blake; cube_4way_context cube; - bmw256_16way_context bmw; + bmw256_16x32_context bmw; } lyra2v3_16way_ctx_holder; static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx; bool init_lyra2rev3_16way_ctx() { - blake256_16way_init( &l2v3_16way_ctx.blake ); + blake256_16x32_init( &l2v3_16way_ctx.blake ); cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 ); - bmw256_16way_init( &l2v3_16way_ctx.bmw ); + bmw256_16x32_init( &l2v3_16way_ctx.bmw ); return true; } @@ -46,8 +46,8 @@ void lyra2rev3_16way_hash( void *state, const void *input ) lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) ); - blake256_16way_update( &ctx.blake, input + (64*16), 16 ); - blake256_16way_close( &ctx.blake, vhash ); + blake256_16x32_update( &ctx.blake, input + (64*16), 16 ); + blake256_16x32_close( &ctx.blake, vhash ); dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15, @@ -120,8 +120,8 @@ void lyra2rev3_16way_hash( void *state, const void *input ) hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15, 256 ); - bmw256_16way_update( &ctx.bmw, vhash, 32 ); - bmw256_16way_close( &ctx.bmw, state ); + bmw256_16x32_update( &ctx.bmw, vhash, 
32 ); + bmw256_16x32_close( &ctx.bmw, state ); } @@ -145,8 +145,8 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce, mm512_bswap32_intrlv80_16x32( vdata, pdata ); - blake256_16way_init( &l2v3_16way_ctx.blake ); - blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 ); + blake256_16x32_init( &l2v3_16way_ctx.blake ); + blake256_16x32_update( &l2v3_16way_ctx.blake, vdata, 64 ); do { @@ -178,18 +178,18 @@ int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce, #elif defined (LYRA2REV3_8WAY) typedef struct { - blake256_8way_context blake; + blake256_8x32_context blake; cubehashParam cube; - bmw256_8way_context bmw; + bmw256_8x32_context bmw; } lyra2v3_8way_ctx_holder; static __thread lyra2v3_8way_ctx_holder l2v3_8way_ctx; bool init_lyra2rev3_8way_ctx() { - blake256_8way_init( &l2v3_8way_ctx.blake ); + blake256_8x32_init( &l2v3_8way_ctx.blake ); cubehashInit( &l2v3_8way_ctx.cube, 256, 16, 32 ); - bmw256_8way_init( &l2v3_8way_ctx.bmw ); + bmw256_8x32_init( &l2v3_8way_ctx.bmw ); return true; } @@ -207,8 +207,8 @@ void lyra2rev3_8way_hash( void *state, const void *input ) lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) ); - blake256_8way_update( &ctx.blake, input + (64*8), 16 ); - blake256_8way_close( &ctx.blake, vhash ); + blake256_8x32_update( &ctx.blake, input + (64*8), 16 ); + blake256_8x32_close( &ctx.blake, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 256 ); @@ -243,8 +243,8 @@ void lyra2rev3_8way_hash( void *state, const void *input ) intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); - bmw256_8way_update( &ctx.bmw, vhash, 32 ); - bmw256_8way_close( &ctx.bmw, state ); + bmw256_8x32_update( &ctx.bmw, vhash, 32 ); + bmw256_8x32_close( &ctx.bmw, state ); } @@ -269,8 +269,8 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, mm256_bswap32_intrlv80_8x32( vdata, pdata ); *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); - blake256_8way_init( &l2v3_8way_ctx.blake ); - blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 ); + blake256_8x32_init( &l2v3_8way_ctx.blake ); + blake256_8x32_update( &l2v3_8way_ctx.blake, vdata, 64 ); do { @@ -300,19 +300,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, #if defined (LYRA2REV3_4WAY) typedef struct { - blake256_4way_context blake; + blake256_4x32_context blake; cubehashParam cube; - bmw256_4way_context bmw; + bmw256_4x32_context bmw; } lyra2v3_4way_ctx_holder; -//static lyra2v3_4way_ctx_holder l2v3_4way_ctx; static __thread lyra2v3_4way_ctx_holder l2v3_4way_ctx; bool init_lyra2rev3_4way_ctx() { - blake256_4way_init( &l2v3_4way_ctx.blake ); + blake256_4x32_init( &l2v3_4way_ctx.blake ); cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 ); - bmw256_4way_init( &l2v3_4way_ctx.bmw ); + bmw256_4x32_init( &l2v3_4way_ctx.bmw ); return true; } @@ -326,8 +325,8 @@ void lyra2rev3_4way_hash( void *state, const void *input ) lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) ); - blake256_4way_update( &ctx.blake, input + (64*4), 16 ); - blake256_4way_close( &ctx.blake, vhash ); + blake256_4x32_update( &ctx.blake, input + (64*4), 16 ); + blake256_4x32_close( &ctx.blake, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); @@ -349,8 +348,8 @@ void lyra2rev3_4way_hash( void 
*state, const void *input ) LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); - bmw256_4way_update( &ctx.bmw, vhash, 32 ); - bmw256_4way_close( &ctx.bmw, state ); + bmw256_4x32_update( &ctx.bmw, vhash, 32 ); + bmw256_4x32_close( &ctx.bmw, state ); } int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce, @@ -374,8 +373,8 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce, v128_bswap32_intrlv80_4x32( vdata, pdata ); *noncev = _mm_set_epi32( n+3, n+2, n+1, n ); - blake256_4way_init( &l2v3_4way_ctx.blake ); - blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 ); + blake256_4x32_init( &l2v3_4way_ctx.blake ); + blake256_4x32_update( &l2v3_4way_ctx.blake, vdata, 64 ); do { diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 203e252..dd6ebd4 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -45,7 +45,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars, uint32_t hash14[8] __attribute__ ((aligned (32))); uint32_t hash15[8] __attribute__ ((aligned (32))); - blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 ); + blake256_16x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 ); dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15, @@ -139,7 +139,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); // Partialy prehash second block without touching nonces in block_buf[3]. - blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); + blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf ); do { lyra2z_16way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -180,7 +180,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars, uint32_t hash7[8] __attribute__ ((aligned (32))); uint32_t vhash[8*8] __attribute__ ((aligned (64))); - blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 ); + blake256_8x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 256 ); @@ -246,7 +246,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce, _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n ); // Partialy prehash second block without touching nonces - blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); + blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf ); do { lyra2z_8way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -279,12 +279,12 @@ bool lyra2z_4way_thread_init() return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) ); } -static __thread blake256_4way_context l2z_4way_blake_mid; +static __thread blake256_4x32_context l2z_4way_blake_mid; void lyra2z_4way_midstate( const void* input ) { - blake256_4way_init( &l2z_4way_blake_mid ); - blake256_4way_update( &l2z_4way_blake_mid, input, 64 ); + blake256_4x32_init( &l2z_4way_blake_mid ); + blake256_4x32_update( &l2z_4way_blake_mid, input, 64 ); } void lyra2z_4way_hash( void *hash, const void *midstate_vars, @@ -295,15 +295,8 @@ void lyra2z_4way_hash( void *hash, const void *midstate_vars, uint32_t hash2[8] __attribute__ ((aligned (64))); uint32_t hash3[8] __attribute__ ((aligned (64))); uint32_t vhash[8*4] __attribute__ ((aligned (64))); -// blake256_4way_context 
ctx_blake __attribute__ ((aligned (64))); - blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 ); - -/* - memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid ); - blake256_4way_update( &ctx_blake, input + (64*4), 16 ); - blake256_4way_close( &ctx_blake, vhash ); -*/ + blake256_4x32_final_rounds_le( vhash, midstate_vars, midhash, block, 14 ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); @@ -357,7 +350,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce, block_buf[15] = v128_32( 640 ); // Partialy prehash second block without touching nonces - blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf ); + blake256_4x32_round0_prehash_le( midstate_vars, block0_hash, block_buf ); do { lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf ); @@ -454,11 +447,9 @@ bool register_lyra2z_algo( algo_gate_t* gate ) #if defined(LYRA2Z_16WAY) gate->miner_thread_init = (void*)&lyra2z_16way_thread_init; gate->scanhash = (void*)&scanhash_lyra2z_16way; -// gate->hash = (void*)&lyra2z_16way_hash; #elif defined(LYRA2Z_8WAY) gate->miner_thread_init = (void*)&lyra2z_8way_thread_init; gate->scanhash = (void*)&scanhash_lyra2z_8way; -// gate->hash = (void*)&lyra2z_8way_hash; #elif defined(LYRA2Z_4WAY) gate->miner_thread_init = (void*)&lyra2z_4way_thread_init; gate->scanhash = (void*)&scanhash_lyra2z_4way; diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index d23a4e5..7db2d38 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -45,7 +45,7 @@ static const uint64_t blake2b_IV[8] = #if defined(SIMD512) -#define G2W_4X64(a,b,c,d) \ +#define G2W(a,b,c,d) \ a = _mm512_add_epi64( a, b ); \ d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \ c = _mm512_add_epi64( c, d ); \ @@ -56,27 +56,15 @@ static const uint64_t blake2b_IV[8] = b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 ); #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ - G2W_4X64( s0, s1, s2, s3 ); \ + G2W( s0, s1, s2, s3 ); \ s0 = mm512_shufll256_64( s0 ); \ - s3 = mm512_swap256_128( s3); \ + s3 = mm512_swap256_128( s3 ); \ s2 = mm512_shuflr256_64( s2 ); \ - G2W_4X64( s0, s1, s2, s3 ); \ + G2W( s0, s1, s2, s3 ); \ s0 = mm512_shuflr256_64( s0 ); \ s3 = mm512_swap256_128( s3 ); \ s2 = mm512_shufll256_64( s2 ); -/* -#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ - G2W_4X64( s0, s1, s2, s3 ); \ - s3 = mm512_shufll256_64( s3 ); \ - s1 = mm512_shuflr256_64( s1); \ - s2 = mm512_swap256_128( s2 ); \ - G2W_4X64( s0, s1, s2, s3 ); \ - s3 = mm512_shuflr256_64( s3 ); \ - s1 = mm512_shufll256_64( s1 ); \ - s2 = mm512_swap256_128( s2 ); -*/ - #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \ LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \ @@ -95,7 +83,7 @@ static const uint64_t blake2b_IV[8] = #if defined(__AVX2__) -#define G_4X64(a,b,c,d) \ +#define G_AVX2(a,b,c,d) \ a = _mm256_add_epi64( a, b ); \ d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \ c = _mm256_add_epi64( c, d ); \ @@ -107,27 +95,15 @@ static const uint64_t blake2b_IV[8] = // Pivot about s1 instead of s0 reduces latency. 
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ - G_4X64( s0, s1, s2, s3 ); \ + G_AVX2( s0, s1, s2, s3 ); \ s0 = mm256_shufll_64( s0 ); \ - s3 = mm256_swap_128( s3); \ + s3 = mm256_swap_128( s3 ); \ s2 = mm256_shuflr_64( s2 ); \ - G_4X64( s0, s1, s2, s3 ); \ + G_AVX2( s0, s1, s2, s3 ); \ s0 = mm256_shuflr_64( s0 ); \ s3 = mm256_swap_128( s3 ); \ s2 = mm256_shufll_64( s2 ); -/* -#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ - G_4X64( s0, s1, s2, s3 ); \ - s3 = mm256_shufll_64( s3 ); \ - s1 = mm256_shuflr_64( s1); \ - s2 = mm256_swap_128( s2 ); \ - G_4X64( s0, s1, s2, s3 ); \ - s3 = mm256_shuflr_64( s3 ); \ - s1 = mm256_shufll_64( s1 ); \ - s2 = mm256_swap_128( s2 ); -*/ - #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ @@ -148,7 +124,7 @@ static const uint64_t blake2b_IV[8] = // process 2 columns in parallel // returns void, all args updated -#define G_2X64(a,b,c,d) \ +#define G_128(a,b,c,d) \ a = v128_add64( a, b ); \ d = v128_ror64xor( d, a, 32 ); \ c = v128_add64( c, d ); \ @@ -161,16 +137,16 @@ static const uint64_t blake2b_IV[8] = #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ { \ v128u64_t t; \ - G_2X64( s0, s2, s4, s6 ); \ - G_2X64( s1, s3, s5, s7 ); \ + G_128( s0, s2, s4, s6 ); \ + G_128( s1, s3, s5, s7 ); \ t = v128_alignr64( s7, s6, 1 ); \ s6 = v128_alignr64( s6, s7, 1 ); \ s7 = t; \ t = v128_alignr64( s2, s3, 1 ); \ s2 = v128_alignr64( s3, s2, 1 ); \ s3 = t; \ - G_2X64( s0, s2, s5, s6 ); \ - G_2X64( s1, s3, s4, s7 ); \ + G_128( s0, s2, s5, s6 ); \ + G_128( s1, s3, s4, s7 ); \ t = v128_alignr64( s6, s7, 1 ); \ s6 = v128_alignr64( s7, s6, 1 ); \ s7 = t; \ diff --git a/algo/panama/panama-hash-4way.h b/algo/panama/panama-hash-4way.h index 4af7442..7becff4 100644 --- a/algo/panama/panama-hash-4way.h +++ b/algo/panama/panama-hash-4way.h @@ -18,11 +18,14 @@ typedef struct { } panama_4way_context __attribute__ ((aligned (64))); void panama_4way_init( void *cc ); - void panama_4way_update( void *cc, const void *data, size_t len ); - void panama_4way_close( void *cc, void *dst ); +#define panama_4x32_context panama_4way_context +#define panama_4x32_init panama_4way_init +#define panama_4x32_update panama_4way_update +#define panama_4x32_close panama_4way_close + #if defined(__AVX2__) typedef struct { @@ -34,10 +37,13 @@ typedef struct { } panama_8way_context __attribute__ ((aligned (128))); void panama_8way_init( void *cc ); - void panama_8way_update( void *cc, const void *data, size_t len ); - void panama_8way_close( void *cc, void *dst ); +#define panama_8x32_context panama_8way_context +#define panama_8x32_init panama_8way_init +#define panama_8x32_update panama_8way_update +#define panama_8x32_close panama_8way_close + #endif #endif diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 2fc58d6..96cc7a8 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -31,20 +31,20 @@ union _hmq1725_8way_context_overlay { - blake512_8way_context blake; - bmw512_8way_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + blake512_8x64_context blake; + bmw512_8x64_context bmw; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cube_4way_context cube; simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_8way_context shabal; + shabal512_8x32_context shabal; sph_whirlpool_context whirlpool; - sha512_8way_context sha512; 
- haval256_5_8way_context haval; + sha512_8x64_context sha512; + haval256_8x32_context haval; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -81,7 +81,7 @@ extern void hmq1725_8way_hash(void *state, const void *input) __m512i* vhB = (__m512i*)vhashB; __m512i* vhC = (__m512i*)vhashC; - bmw512_8way_full( &ctx.bmw, vhash, input, 80 ); + bmw512_8x64_full( &ctx.bmw, vhash, input, 80 ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -141,26 +141,26 @@ extern void hmq1725_8way_hash(void *state, const void *input) // B if ( likely( vh_mask & 0xff ) ) - skein512_8way_full( &ctx.skein, vhashB, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); vh_mask = _mm512_testn_epi64_mask( vh[0], vmask ); // A if ( ( vh_mask & 0xff ) != 0xff ) - blake512_8way_full( &ctx.blake, vhashA, vhash, 64 ); + blake512_8x64_full( &ctx.blake, vhashA, vhash, 64 ); // B if ( vh_mask & 0xff ) - bmw512_8way_full( &ctx.bmw, vhashB, vhash, 64 ); + bmw512_8x64_full( &ctx.bmw, vhashB, vhash, 64 ); mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -176,16 +176,16 @@ extern void hmq1725_8way_hash(void *state, const void *input) if ( likely( ( vh_mask & 0xff ) != 0xff ) ) { - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhashA ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhashA ); } if ( likely( vh_mask & 0xff ) ) { - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhashB ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhashB ); } mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); @@ -251,9 +251,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) // B if ( likely( vh_mask & 0xff ) ) { - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhash, 64 ); - haval256_5_8way_close( &ctx.haval, vhash ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhash, 64 ); + haval256_8x32_close( &ctx.haval, vhash ); memset( &vhash[8<<3], 0, 32<<3 ); rintrlv_8x32_8x64( vhashB, vhash, 512 ); } @@ -296,7 +296,7 @@ extern void hmq1725_8way_hash(void *state, const void *input) #endif - blake512_8way_full( &ctx.blake, vhash, vhash, 64 ); + blake512_8x64_full( &ctx.blake, vhash, vhash, 64 ); vh_mask = _mm512_testn_epi64_mask( vh[0], vmask ); @@ -351,9 +351,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, 
hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -429,9 +429,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) rintrlv_8x64_8x32( vhashA, vhash, 512 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhashA, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhashA, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -474,9 +474,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) // B if ( likely( vh_mask & 0xff ) ) { - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, 64 ); - sha512_8way_close( &ctx.sha512, vhashB ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, 64 ); + sha512_8x64_close( &ctx.sha512, vhashB ); } mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); @@ -509,9 +509,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) #endif - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, 64 ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, 64 ); + sha512_8x64_close( &ctx.sha512, vhash ); vh_mask = _mm512_testn_epi64_mask( vh[0], vmask ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, @@ -522,9 +522,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) { intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhash, 64 ); - haval256_5_8way_close( &ctx.haval, vhash ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhash, 64 ); + haval256_8x32_close( &ctx.haval, vhash ); memset( &vhash[8<<3], 0, 32<<3 ); rintrlv_8x32_8x64( vhashA, vhash, 512 ); } @@ -551,9 +551,9 @@ extern void hmq1725_8way_hash(void *state, const void *input) hash7 ); mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); - bmw512_8way_init( &ctx.bmw ); - bmw512_8way_update( &ctx.bmw, vhash, 64 ); - bmw512_8way_close( &ctx.bmw, state ); + bmw512_8x64_init( &ctx.bmw ); + bmw512_8x64_update( &ctx.bmw, vhash, 64 ); + bmw512_8x64_close( &ctx.bmw, state ); } int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce, @@ -605,12 +605,12 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce, union _hmq1725_4way_context_overlay { - blake512_4way_context blake; - bmw512_4way_context bmw; + blake512_4x64_context blake; + bmw512_4x64_context bmw; hashState_groestl groestl; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; + skein512_4x64_context skein; + jh512_4x64_context jh; + keccak512_4x64_context keccak; hashState_luffa luffa; luffa_2way_context luffa2; cubehashParam cube; @@ -620,12 +620,12 @@ union _hmq1725_4way_context_overlay shavite512_2way_context shavite2; simd_2way_context simd_2way; hashState_echo echo; - hamsi512_4way_context hamsi; + hamsi512_4x64_context hamsi; hashState_fugue fugue; - shabal512_4way_context shabal; + shabal512_4x32_context shabal; sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; + sha512_4x64_context sha512; + haval256_4x32_context haval; #if defined(__VAES__) groestl512_2way_context groestl2; echo_2way_context echo2; @@ -652,9 +652,9 @@ extern void hmq1725_4way_hash(void *state, const void *input) __m256i* vhA = (__m256i*)vhashA; __m256i* vhB = (__m256i*)vhashB; - 
bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, input, 80 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, input, 80 ); + bmw512_4x64_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -686,17 +686,17 @@ extern void hmq1725_4way_hash(void *state, const void *input) // B if ( h_mask & 0xffffffff ) - skein512_4way_full( &ctx.skein, vhashB, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhashB, vhash, 64 ); mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); // second fork, A = blake parallel, B= bmw parallel. @@ -704,13 +704,13 @@ extern void hmq1725_4way_hash(void *state, const void *input) h_mask = _mm256_movemask_epi8( vh_mask ); if ( ( h_mask & 0xffffffff ) != 0xffffffff ) - blake512_4way_full( &ctx.blake, vhashA, vhash, 64 ); + blake512_4x64_full( &ctx.blake, vhashA, vhash, 64 ); if ( h_mask & 0xffffffff ) { - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhashB ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -733,16 +733,16 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( ( h_mask & 0xffffffff ) != 0xffffffff ) { - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhashA ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhashA ); } if ( h_mask & 0xffffffff ) { - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhashB ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -778,9 +778,9 @@ extern void hmq1725_4way_hash(void *state, const void *input) // B if ( h_mask & 0xffffffff ) { - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhash ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhash, 64 ); + haval256_4x32_close( &ctx.haval, vhash ); memset( &vhash[8<<2], 0, 32<<2 ); rintrlv_4x32_4x64( vhashB, vhash, 512 ); } @@ -813,7 +813,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) #endif - blake512_4way_full( &ctx.blake, vhash, vhash, 64 ); + blake512_4x64_full( &ctx.blake, vhash, vhash, 64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -845,9 +845,9 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, 
hash3, vhash, 512 ); @@ -890,9 +890,9 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, 64 ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -921,9 +921,9 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( h_mask & 0xffffffff ) { - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhashB ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, 64 ); + sha512_4x64_close( &ctx.sha512, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); @@ -950,9 +950,9 @@ extern void hmq1725_4way_hash(void *state, const void *input) #endif - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, 64 ); + sha512_4x64_close( &ctx.sha512, vhash ); // A = haval parallel, B = Whirlpool serial @@ -964,9 +964,9 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( ( h_mask & 0xffffffff ) != 0xffffffff ) { - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhash ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhash, 64 ); + haval256_4x32_close( &ctx.haval, vhash ); memset( &vhash[8<<2], 0, 32<<2 ); rintrlv_4x32_4x64( vhashA, vhash, 512 ); } @@ -984,9 +984,9 @@ extern void hmq1725_4way_hash(void *state, const void *input) mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, state ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, state ); } int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c index e05892e..6a0640f 100644 --- a/algo/ripemd/lbry-4way.c +++ b/algo/ripemd/lbry-4way.c @@ -13,7 +13,7 @@ #if defined(LBRY_16WAY) -static __thread sha256_16way_context sha256_16w_mid; +static __thread sha256_16x32_context sha256_16w_mid; void lbry_16way_hash( void* output, const void* input ) { @@ -36,17 +36,17 @@ void lbry_16way_hash( void* output, const void* input ) uint32_t _ALIGN(64) h13[32]; uint32_t _ALIGN(64) h14[32]; uint32_t _ALIGN(64) h15[32]; - sha256_16way_context ctx_sha256 __attribute__ ((aligned (64))); - sha512_8way_context ctx_sha512; - ripemd160_16way_context ctx_ripemd; + sha256_16x32_context ctx_sha256 __attribute__ ((aligned (64))); + sha512_8x64_context ctx_sha512; + ripemd160_16x32_context ctx_ripemd; memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) ); - sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL ); - sha256_16way_close( &ctx_sha256, vhashA ); + sha256_16x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL ); + sha256_16x32_close( &ctx_sha256, vhashA ); - sha256_16way_init( &ctx_sha256 ); - sha256_16way_update( &ctx_sha256, vhashA, 32 ); - sha256_16way_close( &ctx_sha256, vhashA ); + sha256_16x32_init( &ctx_sha256 ); + sha256_16x32_update( &ctx_sha256, vhashA, 32 ); + sha256_16x32_close( &ctx_sha256, vhashA ); // 
reinterleave to do sha512 4-way 64 bit twice. dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7, @@ -54,13 +54,13 @@ void lbry_16way_hash( void* output, const void* input ) intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 ); intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 ); - sha512_8way_init( &ctx_sha512 ); - sha512_8way_update( &ctx_sha512, vhashA, 32 ); - sha512_8way_close( &ctx_sha512, vhashA ); + sha512_8x64_init( &ctx_sha512 ); + sha512_8x64_update( &ctx_sha512, vhashA, 32 ); + sha512_8x64_close( &ctx_sha512, vhashA ); - sha512_8way_init( &ctx_sha512 ); - sha512_8way_update( &ctx_sha512, vhashB, 32 ); - sha512_8way_close( &ctx_sha512, vhashB ); + sha512_8x64_init( &ctx_sha512 ); + sha512_8x64_update( &ctx_sha512, vhashB, 32 ); + sha512_8x64_close( &ctx_sha512, vhashB ); // back to 8-way 32 bit dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 ); @@ -68,22 +68,22 @@ void lbry_16way_hash( void* output, const void* input ) intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, h14, h15, 512 ); - ripemd160_16way_init( &ctx_ripemd ); - ripemd160_16way_update( &ctx_ripemd, vhashA, 32 ); - ripemd160_16way_close( &ctx_ripemd, vhashB ); + ripemd160_16x32_init( &ctx_ripemd ); + ripemd160_16x32_update( &ctx_ripemd, vhashA, 32 ); + ripemd160_16x32_close( &ctx_ripemd, vhashB ); - ripemd160_16way_init( &ctx_ripemd ); - ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 ); - ripemd160_16way_close( &ctx_ripemd, vhashC ); + ripemd160_16x32_init( &ctx_ripemd ); + ripemd160_16x32_update( &ctx_ripemd, vhashA+(8<<4), 32 ); + ripemd160_16x32_close( &ctx_ripemd, vhashC ); - sha256_16way_init( &ctx_sha256 ); - sha256_16way_update( &ctx_sha256, vhashB, 20 ); - sha256_16way_update( &ctx_sha256, vhashC, 20 ); - sha256_16way_close( &ctx_sha256, vhashA ); + sha256_16x32_init( &ctx_sha256 ); + sha256_16x32_update( &ctx_sha256, vhashB, 20 ); + sha256_16x32_update( &ctx_sha256, vhashC, 20 ); + sha256_16x32_close( &ctx_sha256, vhashA ); - sha256_16way_init( &ctx_sha256 ); - sha256_16way_update( &ctx_sha256, vhashA, 32 ); - sha256_16way_close( &ctx_sha256, output ); + sha256_16x32_init( &ctx_sha256 ); + sha256_16x32_update( &ctx_sha256, vhashA, 32 ); + sha256_16x32_close( &ctx_sha256, output ); } int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, @@ -115,8 +115,8 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 ); - sha256_16way_init( &sha256_16w_mid ); - sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE ); + sha256_16x32_init( &sha256_16w_mid ); + sha256_16x32_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE ); do { @@ -144,7 +144,7 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, #elif defined(LBRY_8WAY) -static __thread sha256_8way_context sha256_8w_mid; +static __thread sha256_8x32_context sha256_8w_mid; void lbry_8way_hash( void* output, const void* input ) { @@ -159,52 +159,52 @@ void lbry_8way_hash( void* output, const void* input ) uint32_t _ALIGN(32) h5[32]; uint32_t _ALIGN(32) h6[32]; uint32_t _ALIGN(32) h7[32]; - sha256_8way_context ctx_sha256 __attribute__ ((aligned (64))); - sha512_4way_context ctx_sha512; - ripemd160_8way_context ctx_ripemd; + sha256_8x32_context ctx_sha256 __attribute__ ((aligned (64))); + sha512_4x64_context ctx_sha512; + ripemd160_8x32_context ctx_ripemd; memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) ); - sha256_8way_update( 
&ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL ); - sha256_8way_close( &ctx_sha256, vhashA ); + sha256_8x32_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL ); + sha256_8x32_close( &ctx_sha256, vhashA ); - sha256_8way_init( &ctx_sha256 ); - sha256_8way_update( &ctx_sha256, vhashA, 32 ); - sha256_8way_close( &ctx_sha256, vhashA ); + sha256_8x32_init( &ctx_sha256 ); + sha256_8x32_update( &ctx_sha256, vhashA, 32 ); + sha256_8x32_close( &ctx_sha256, vhashA ); // reinterleave to do sha512 4-way 64 bit twice. dintrlv_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 ); intrlv_4x64( vhashA, h0, h1, h2, h3, 256 ); intrlv_4x64( vhashB, h4, h5, h6, h7, 256 ); - sha512_4way_init( &ctx_sha512 ); - sha512_4way_update( &ctx_sha512, vhashA, 32 ); - sha512_4way_close( &ctx_sha512, vhashA ); + sha512_4x64_init( &ctx_sha512 ); + sha512_4x64_update( &ctx_sha512, vhashA, 32 ); + sha512_4x64_close( &ctx_sha512, vhashA ); - sha512_4way_init( &ctx_sha512 ); - sha512_4way_update( &ctx_sha512, vhashB, 32 ); - sha512_4way_close( &ctx_sha512, vhashB ); + sha512_4x64_init( &ctx_sha512 ); + sha512_4x64_update( &ctx_sha512, vhashB, 32 ); + sha512_4x64_close( &ctx_sha512, vhashB ); // back to 8-way 32 bit dintrlv_4x64( h0, h1, h2, h3, vhashA, 512 ); dintrlv_4x64( h4, h5, h6, h7, vhashB, 512 ); intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 ); - ripemd160_8way_init( &ctx_ripemd ); - ripemd160_8way_update( &ctx_ripemd, vhashA, 32 ); - ripemd160_8way_close( &ctx_ripemd, vhashB ); + ripemd160_8x32_init( &ctx_ripemd ); + ripemd160_8x32_update( &ctx_ripemd, vhashA, 32 ); + ripemd160_8x32_close( &ctx_ripemd, vhashB ); - ripemd160_8way_init( &ctx_ripemd ); - ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 ); - ripemd160_8way_close( &ctx_ripemd, vhashC ); + ripemd160_8x32_init( &ctx_ripemd ); + ripemd160_8x32_update( &ctx_ripemd, vhashA+(8<<3), 32 ); + ripemd160_8x32_close( &ctx_ripemd, vhashC ); - sha256_8way_init( &ctx_sha256 ); - sha256_8way_update( &ctx_sha256, vhashB, 20 ); - sha256_8way_update( &ctx_sha256, vhashC, 20 ); - sha256_8way_close( &ctx_sha256, vhashA ); + sha256_8x32_init( &ctx_sha256 ); + sha256_8x32_update( &ctx_sha256, vhashB, 20 ); + sha256_8x32_update( &ctx_sha256, vhashC, 20 ); + sha256_8x32_close( &ctx_sha256, vhashA ); - sha256_8way_init( &ctx_sha256 ); - sha256_8way_update( &ctx_sha256, vhashA, 32 ); - sha256_8way_close( &ctx_sha256, output ); + sha256_8x32_init( &ctx_sha256 ); + sha256_8x32_update( &ctx_sha256, vhashA, 32 ); + sha256_8x32_close( &ctx_sha256, output ); } int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, @@ -235,8 +235,8 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, intrlv_8x32( vdata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 ); - sha256_8way_init( &sha256_8w_mid ); - sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE ); + sha256_8x32_init( &sha256_8w_mid ); + sha256_8x32_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE ); do { diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c index 664e83a..bcc5b0d 100644 --- a/algo/ripemd/ripemd-hash-4way.c +++ b/algo/ripemd/ripemd-hash-4way.c @@ -57,7 +57,7 @@ do{ \ #define ROUND2(a, b, c, d, e, f, s, r, k) \ RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) -static void ripemd160_4way_round( ripemd160_4way_context *sc ) +static void ripemd160_4x32_round( ripemd160_4x32_context *sc ) { const __m128i *in = (__m128i*)sc->buf; __m128i *h = (__m128i*)sc->val; @@ -249,7 +249,7 @@ static void ripemd160_4way_round( ripemd160_4way_context 
*sc ) h[0] = tmp; } -void ripemd160_4way_init( ripemd160_4way_context *sc ) +void ripemd160_4x32_init( ripemd160_4x32_context *sc ) { sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 ); sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 ); @@ -259,7 +259,7 @@ void ripemd160_4way_init( ripemd160_4way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data, +void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data, size_t len ) { __m128i *vdata = (__m128i*)data; @@ -281,7 +281,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data, len -= clen; if ( ptr == block_size ) { - ripemd160_4way_round( sc ); + ripemd160_4x32_round( sc ); ptr = 0; } clow = sc->count_low; @@ -292,7 +292,7 @@ void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data, } } -void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ) +void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst ) { unsigned ptr, u; uint32_t low, high; @@ -306,7 +306,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 ); - ripemd160_4way_round( sc ); + ripemd160_4x32_round( sc ); memset_zero_128( sc->buf, pad>>2 ); } else @@ -317,7 +317,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ) low = low << 3; sc->buf[ pad>>2 ] = _mm_set1_epi32( low ); sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high ); - ripemd160_4way_round( sc ); + ripemd160_4x32_round( sc ); for (u = 0; u < 5; u ++) casti_v128u32( dst, u ) = sc->val[u]; } @@ -357,7 +357,7 @@ do{ \ #define ROUND2_8W(a, b, c, d, e, f, s, r, k) \ RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) -static void ripemd160_8way_round( ripemd160_8way_context *sc ) +static void ripemd160_8x32_round( ripemd160_8x32_context *sc ) { const __m256i *in = (__m256i*)sc->buf; __m256i *h = (__m256i*)sc->val; @@ -550,7 +550,7 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc ) } -void ripemd160_8way_init( ripemd160_8way_context *sc ) +void ripemd160_8x32_init( ripemd160_8x32_context *sc ) { sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 ); sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 ); @@ -560,7 +560,7 @@ void ripemd160_8way_init( ripemd160_8way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, +void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; @@ -582,7 +582,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, len -= clen; if ( ptr == block_size ) { - ripemd160_8way_round( sc ); + ripemd160_8x32_round( sc ); ptr = 0; } clow = sc->count_low; @@ -593,7 +593,7 @@ void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, } } -void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ) +void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst ) { unsigned ptr, u; uint32_t low, high; @@ -607,7 +607,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_256( sc->buf + (ptr>>2), (block_size - ptr) >> 2 ); - ripemd160_8way_round( sc ); + ripemd160_8x32_round( sc ); memset_zero_256( sc->buf, pad>>2 ); } else @@ -618,7 +618,7 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ) low = low << 3; sc->buf[ pad>>2 ] = _mm256_set1_epi32( low ); sc->buf[ (pad>>2) + 1 ] = 
_mm256_set1_epi32( high ); - ripemd160_8way_round( sc ); + ripemd160_8x32_round( sc ); for (u = 0; u < 5; u ++) casti_m256i( dst, u ) = sc->val[u]; } @@ -629,7 +629,6 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ) // RIPEMD-160 16 way - #define F16W_1(x, y, z) \ _mm512_xor_si512( _mm512_xor_si512( x, y ), z ) @@ -659,7 +658,7 @@ do{ \ #define ROUND2_16W(a, b, c, d, e, f, s, r, k) \ RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) -static void ripemd160_16way_round( ripemd160_16way_context *sc ) +static void ripemd160_16x32_round( ripemd160_16x32_context *sc ) { const __m512i *in = (__m512i*)sc->buf; __m512i *h = (__m512i*)sc->val; @@ -851,7 +850,7 @@ static void ripemd160_16way_round( ripemd160_16way_context *sc ) h[0] = tmp; } -void ripemd160_16way_init( ripemd160_16way_context *sc ) +void ripemd160_16x32_init( ripemd160_16x32_context *sc ) { sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 ); sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 ); @@ -861,7 +860,7 @@ void ripemd160_16way_init( ripemd160_16way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data, +void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data, size_t len ) { __m512i *vdata = (__m512i*)data; @@ -883,7 +882,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data, len -= clen; if ( ptr == block_size ) { - ripemd160_16way_round( sc ); + ripemd160_16x32_round( sc ); ptr = 0; } clow = sc->count_low; @@ -894,7 +893,7 @@ void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data, } } -void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst ) +void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst ) { unsigned ptr, u; uint32_t low, high; @@ -908,7 +907,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 ); - ripemd160_16way_round( sc ); + ripemd160_16x32_round( sc ); memset_zero_512( sc->buf, pad>>2 ); } else @@ -919,7 +918,7 @@ void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst ) low = low << 3; sc->buf[ pad>>2 ] = _mm512_set1_epi32( low ); sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high ); - ripemd160_16way_round( sc ); + ripemd160_16x32_round( sc ); for (u = 0; u < 5; u ++) casti_m512i( dst, u ) = sc->val[u]; } diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h index 2f0fceb..7a50edf 100644 --- a/algo/ripemd/ripemd-hash-4way.h +++ b/algo/ripemd/ripemd-hash-4way.h @@ -12,12 +12,12 @@ typedef struct __m128i buf[64>>2]; __m128i val[5]; uint32_t count_high, count_low; -} __attribute__ ((aligned (64))) ripemd160_4way_context; +} __attribute__ ((aligned (64))) ripemd160_4x32_context; -void ripemd160_4way_init( ripemd160_4way_context *sc ); -void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data, +void ripemd160_4x32_init( ripemd160_4x32_context *sc ); +void ripemd160_4x32_update( ripemd160_4x32_context *sc, const void *data, size_t len ); -void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ); +void ripemd160_4x32_close( ripemd160_4x32_context *sc, void *dst ); #if defined (__AVX2__) @@ -26,12 +26,12 @@ typedef struct __m256i buf[64>>2]; __m256i val[5]; uint32_t count_high, count_low; -} __attribute__ ((aligned (128))) ripemd160_8way_context; +} __attribute__ ((aligned (128))) ripemd160_8x32_context; -void ripemd160_8way_init( ripemd160_8way_context *sc ); 
-void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, +void ripemd160_8x32_init( ripemd160_8x32_context *sc ); +void ripemd160_8x32_update( ripemd160_8x32_context *sc, const void *data, size_t len ); -void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ); +void ripemd160_8x32_close( ripemd160_8x32_context *sc, void *dst ); #if defined(SIMD512) @@ -40,12 +40,12 @@ typedef struct __m512i buf[64>>2]; __m512i val[5]; uint32_t count_high, count_low; -} __attribute__ ((aligned (128))) ripemd160_16way_context; +} __attribute__ ((aligned (128))) ripemd160_16x32_context; -void ripemd160_16way_init( ripemd160_16way_context *sc ); -void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data, +void ripemd160_16x32_init( ripemd160_16x32_context *sc ); +void ripemd160_16x32_update( ripemd160_16x32_context *sc, const void *data, size_t len ); -void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst ); +void ripemd160_16x32_close( ripemd160_16x32_context *sc, void *dst ); #endif // AVX512 #endif // __AVX2__ diff --git a/algo/scrypt/neoscrypt.c b/algo/scrypt/neoscrypt.c index 35e5672..c57bb77 100644 --- a/algo/scrypt/neoscrypt.c +++ b/algo/scrypt/neoscrypt.c @@ -597,6 +597,45 @@ static void blake2s_compress(blake2s_state *S, const void *buf) { v[13] = S->t[1] ^ blake2s_IV[5]; v[14] = S->f[0] ^ blake2s_IV[6]; v[15] = S->f[1] ^ blake2s_IV[7]; + +#if defined(__SSE2__) || defined(__ARM_NEON) + + v128_t *V = (v128_t*)v; + +#define ROUND( r ) \ + V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \ + m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \ + m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \ + V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \ + V[2] = v128_add32( V[2], V[3] ); \ + V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \ + V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \ + m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \ + m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \ + V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \ + V[2] = v128_add32( V[2], V[3] ); \ + V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \ + V[0] = v128_shufll32( V[0] ); \ + V[3] = v128_swap64( V[3] ); \ + V[2] = v128_shuflr32( V[2] ); \ + V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \ + m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \ + m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \ + V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \ + V[2] = v128_add32( V[2], V[3] ); \ + V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \ + V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \ + m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \ + m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \ + V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \ + V[2] = v128_add32( V[2], V[3] ); \ + V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \ + V[0] = v128_shuflr32( V[0] ); \ + V[3] = v128_swap64( V[3] ); \ + V[2] = v128_shufll32( V[2] ) + +#else + #define G(r,i,a,b,c,d) \ do { \ a = a + b + m[blake2s_sigma[r][2*i+0]]; \ @@ -619,6 +658,9 @@ static void blake2s_compress(blake2s_state *S, const void *buf) { G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ } while(0) + +#endif + ROUND(0); ROUND(1); ROUND(2); diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index 879d2b9..d97a8cb 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -336,7 +336,7 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { }; */ -static inline void sha256_4way_init_state( void *state ) 
+static inline void sha256_4x32_init_state( void *state ) { casti_v128( state, 0 ) = v128_32( 0x6A09E667 ); casti_v128( state, 1 ) = v128_32( 0xBB67AE85 ); @@ -359,21 +359,21 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, memcpy( pad, key + 4*16, 4*16 ); memcpy( pad + 4*4, keypad_4way, 4*48 ); - sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad, + sha256_4x32_transform_le( (v128_t*)ihash, (v128_t*)pad, (const v128_t*)tstate ); - sha256_4way_init_state( tstate ); + sha256_4x32_init_state( tstate ); for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c; - sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad, + sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)pad, (const v128_t*)tstate ); for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; for ( ; i < 4*16; i++ ) pad[i] = 0x36363636; - sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad, + sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)pad, (const v128_t*)tstate ); } @@ -386,7 +386,7 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, uint32_t _ALIGN(16) obuf[4 * 16]; int i, j; - sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt, + sha256_4x32_transform_le( (v128_t*)istate, (v128_t*)salt, (const v128_t*)tstate ); memcpy(ibuf, salt + 4 * 16, 4 * 16); @@ -400,10 +400,10 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, ibuf[4 * 4 + 2] = i + 1; ibuf[4 * 4 + 3] = i + 1; - sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf, + sha256_4x32_transform_le( (v128_t*)obuf, (v128_t*)ibuf, (const v128_t*)istate ); - sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf, + sha256_4x32_transform_le( (v128_t*)ostate2, (v128_t*)obuf, (const v128_t*)ostate ); for ( j = 0; j < 4 * 8; j++ ) @@ -418,9 +418,9 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate, uint32_t _ALIGN(64) buf[4 * 16]; int i; - sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt, + sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)salt, (const v128_t*)tstate ); - sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16), + sha256_4x32_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16), (const v128_t*)tstate ); final[ 0] = v128_32( 0x00000001 ); @@ -431,13 +431,13 @@ static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate, = v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128(); final[15] = v128_32 ( 0x00000620 ); - sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final, + sha256_4x32_transform_le( (v128_t*)tstate, (v128_t*)final, (const v128_t*)tstate ); memcpy(buf, tstate, 4 * 32); memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf, + sha256_4x32_transform_le( (v128_t*)ostate, (v128_t*)buf, (const v128_t*)ostate ); for ( i = 0; i < 4 * 8; i++ ) @@ -467,7 +467,7 @@ static const uint32_t _ALIGN(32) finalblk_8way[8 * 16] = { }; */ -static inline void sha256_8way_init_state( void *state ) +static inline void sha256_8x32_init_state( void *state ) { casti_m256i( state, 0 ) = _mm256_set1_epi32( 0x6A09E667 ); casti_m256i( state, 1 ) = _mm256_set1_epi32( 0xBB67AE85 ); @@ -491,21 +491,21 @@ static inline void HMAC_SHA256_80_init_8way( const uint32_t *key, memset( pad + 8*5, 0x00, 8*40 ); for ( i = 0; i < 8; i++ ) pad[ 8*15 + i ] = 0x00000280; - sha256_8way_transform_le( (__m256i*)ihash, (__m256i*)pad, + sha256_8x32_transform_le( (__m256i*)ihash, (__m256i*)pad, (const __m256i*)tstate ); - sha256_8way_init_state( tstate ); + sha256_8x32_init_state( 
tstate ); for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; for ( ; i < 8*16; i++ ) pad[i] = 0x5c5c5c5c; - sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)pad, + sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)pad, (const __m256i*)tstate ); for ( i = 0; i < 8*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; for ( ; i < 8*16; i++ ) pad[i] = 0x36363636; - sha256_8way_transform_le( (__m256i*)tstate, (__m256i*)pad, + sha256_8x32_transform_le( (__m256i*)tstate, (__m256i*)pad, (const __m256i*)tstate ); } @@ -518,7 +518,7 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate, uint32_t _ALIGN(32) obuf[8 * 16]; int i, j; - sha256_8way_transform_le( (__m256i*)istate, (__m256i*)salt, + sha256_8x32_transform_le( (__m256i*)istate, (__m256i*)salt, (const __m256i*)tstate ); memcpy( ibuf, salt + 8*16, 8*16 ); @@ -541,10 +541,10 @@ static inline void PBKDF2_SHA256_80_128_8way( const uint32_t *tstate, ibuf[8 * 4 + 6] = i + 1; ibuf[8 * 4 + 7] = i + 1; - sha256_8way_transform_le( (__m256i*)obuf, (__m256i*)ibuf, + sha256_8x32_transform_le( (__m256i*)obuf, (__m256i*)ibuf, (const __m256i*)istate ); - sha256_8way_transform_le( (__m256i*)ostate2, (__m256i*)obuf, + sha256_8x32_transform_le( (__m256i*)ostate2, (__m256i*)obuf, (const __m256i*)ostate ); for ( j = 0; j < 8*8; j++ ) @@ -559,9 +559,9 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, uint32_t _ALIGN(128) buf[ 8*16 ]; int i; - sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)salt, + sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)salt, (const __m256i*)tstate ); - sha256_8way_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16), + sha256_8x32_transform_be( (__m256i*)tstate, (__m256i*)( salt + 8*16), (const __m256i*)tstate ); final[ 0] = _mm256_set1_epi32( 0x00000001 ); @@ -572,7 +572,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, = _mm256_setzero_si256(); final[15] = _mm256_set1_epi32 ( 0x00000620 ); - sha256_8way_transform_le( (__m256i*)tstate, final, + sha256_8x32_transform_le( (__m256i*)tstate, final, (const __m256i*)tstate ); memcpy( buf, tstate, 8*32 ); @@ -580,7 +580,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, memset( buf + 8*9, 0x00, 8*24 ); for ( i = 0; i < 8; i++ ) buf[ 8*15 + i ] = 0x00000300; - sha256_8way_transform_le( (__m256i*)ostate, (__m256i*)buf, + sha256_8x32_transform_le( (__m256i*)ostate, (__m256i*)buf, (const __m256i*)ostate ); for (i = 0; i < 8 * 8; i++) @@ -591,7 +591,7 @@ static inline void PBKDF2_SHA256_128_32_8way( uint32_t *tstate, #if defined(SIMD512) -static inline void sha256_16way_init_state( void *state ) +static inline void sha256_16x32_init_state( void *state ) { casti_m512i( state, 0 ) = _mm512_set1_epi32( 0x6A09E667 ); casti_m512i( state, 1 ) = _mm512_set1_epi32( 0xBB67AE85 ); @@ -615,21 +615,21 @@ static inline void HMAC_SHA256_80_init_16way( const uint32_t *key, memset( pad + 16*5, 0x00, 16*40 ); for ( i = 0; i < 16; i++ ) pad[ 16*15 + i ] = 0x00000280; - sha256_16way_transform_le( (__m512i*)ihash, (__m512i*)pad, + sha256_16x32_transform_le( (__m512i*)ihash, (__m512i*)pad, (const __m512i*)tstate ); - sha256_16way_init_state( tstate ); + sha256_16x32_init_state( tstate ); for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; for ( ; i < 16*16; i++ ) pad[i] = 0x5c5c5c5c; - sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)pad, + sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)pad, (const __m512i*)tstate ); for ( i = 0; i < 16*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; for ( ; i < 16*16; i++ ) 
pad[i] = 0x36363636; - sha256_16way_transform_le( (__m512i*)tstate, (__m512i*)pad, + sha256_16x32_transform_le( (__m512i*)tstate, (__m512i*)pad, (const __m512i*)tstate ); } @@ -642,7 +642,7 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate, uint32_t _ALIGN(128) ostate2[ 16*8 ]; int i, j; - sha256_16way_transform_le( (__m512i*)istate, (__m512i*)salt, + sha256_16x32_transform_le( (__m512i*)istate, (__m512i*)salt, (const __m512i*)tstate ); memcpy( ibuf, salt + 16*16, 16*16 ); @@ -673,10 +673,10 @@ static inline void PBKDF2_SHA256_80_128_16way( const uint32_t *tstate, ibuf[ 16*4 + 14 ] = i + 1; ibuf[ 16*4 + 15 ] = i + 1; - sha256_16way_transform_le( (__m512i*)obuf, (__m512i*)ibuf, + sha256_16x32_transform_le( (__m512i*)obuf, (__m512i*)ibuf, (const __m512i*)istate ); - sha256_16way_transform_le( (__m512i*)ostate2, (__m512i*)obuf, + sha256_16x32_transform_le( (__m512i*)ostate2, (__m512i*)obuf, (const __m512i*)ostate ); for ( j = 0; j < 16*8; j++ ) @@ -691,9 +691,9 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate, uint32_t _ALIGN(128) buf[ 16*16 ]; int i; - sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)salt, + sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)salt, (const __m512i*)tstate ); - sha256_16way_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16), + sha256_16x32_transform_be( (__m512i*)tstate, (__m512i*)( salt + 16*16), (const __m512i*)tstate ); final[ 0] = _mm512_set1_epi32( 0x00000001 ); @@ -704,7 +704,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate, = _mm512_setzero_si512(); final[15] = _mm512_set1_epi32 ( 0x00000620 ); - sha256_16way_transform_le( (__m512i*)tstate, final, + sha256_16x32_transform_le( (__m512i*)tstate, final, (const __m512i*)tstate ); memcpy( buf, tstate, 16*32 ); @@ -712,7 +712,7 @@ static inline void PBKDF2_SHA256_128_32_16way( uint32_t *tstate, memset( buf + 16*9, 0x00, 16*24 ); for ( i = 0; i < 16; i++ ) buf[ 16*15 + i ] = 0x00000300; - sha256_16way_transform_le( (__m512i*)ostate, (__m512i*)buf, + sha256_16x32_transform_le( (__m512i*)ostate, (__m512i*)buf, (const __m512i*)ostate ); for ( i = 0; i < 16*8; i++ ) diff --git a/algo/sha/hmac-sha256-hash-4way.c b/algo/sha/hmac-sha256-hash-4way.c index 64a82ca..b10aa0c 100644 --- a/algo/sha/hmac-sha256-hash-4way.c +++ b/algo/sha/hmac-sha256-hash-4way.c @@ -31,7 +31,7 @@ #include "hmac-sha256-hash-4way.h" #include "compat.h" -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) // HMAC 4-way SSE2 /** @@ -62,30 +62,30 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K, /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - sha256_4way_init( &ctx->ictx ); - sha256_4way_update( &ctx->ictx, K, Klen ); - sha256_4way_close( &ctx->ictx, khash ); + sha256_4x32_init( &ctx->ictx ); + sha256_4x32_update( &ctx->ictx, K, Klen ); + sha256_4x32_close( &ctx->ictx, khash ); K = khash; Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - sha256_4way_init( &ctx->ictx ); + sha256_4x32_init( &ctx->ictx ); memset( pad, 0x36, 64*4 ); for ( i = 0; i < Klen; i++ ) - casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ), - casti_v128u32( K, i ) ); + casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ), + casti_v128u32( K, i ) ); - sha256_4way_update( &ctx->ictx, pad, 64 ); + sha256_4x32_update( &ctx->ictx, pad, 64 ); /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). 
*/ - sha256_4way_init( &ctx->octx ); + sha256_4x32_init( &ctx->octx ); memset( pad, 0x5c, 64*4 ); for ( i = 0; i < Klen/4; i++ ) - casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ), - casti_v128u32( K, i ) ); - sha256_4way_update( &ctx->octx, pad, 64 ); + casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ), + casti_v128u32( K, i ) ); + sha256_4x32_update( &ctx->octx, pad, 64 ); } /* Add bytes to the HMAC-SHA256 operation. */ @@ -94,7 +94,7 @@ hmac_sha256_4way_update( hmac_sha256_4way_context *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ - sha256_4way_update( &ctx->ictx, in, len ); + sha256_4x32_update( &ctx->ictx, in, len ); } /* Finish an HMAC-SHA256 operation. */ @@ -104,13 +104,13 @@ hmac_sha256_4way_close( hmac_sha256_4way_context *ctx, void *digest ) unsigned char ihash[32*4] __attribute__ ((aligned (64))); /* Finish the inner SHA256 operation. */ - sha256_4way_close( &ctx->ictx, ihash ); + sha256_4x32_close( &ctx->ictx, ihash ); /* Feed the inner hash to the outer SHA256 operation. */ - sha256_4way_update( &ctx->octx, ihash, 32 ); + sha256_4x32_update( &ctx->octx, ihash, 32 ); /* Finish the outer SHA256 operation. */ - sha256_4way_close( &ctx->octx, digest ); + sha256_4x32_close( &ctx->octx, digest ); } /** @@ -126,7 +126,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen, hmac_sha256_4way_context PShctx, hctx; uint8_t _ALIGN(128) T[32*4]; uint8_t _ALIGN(128) U[32*4]; - __m128i ivec; + v128u32_t ivec; size_t i, clen; uint64_t j; int k; @@ -139,7 +139,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen, for ( i = 0; i * 32 < dkLen; i++ ) { /* Generate INT(i + 1). */ - ivec = _mm_set1_epi32( bswap_32( i+1 ) ); + ivec = v128_32( bswap_32( i+1 ) ); /* Compute U_1 = PRF(P, S || INT(i)). */ memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) ); @@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen, /* ... xor U_j ... */ for ( k = 0; k < 8; k++ ) - casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ), - casti_v128u32( U, k ) ); + casti_v128u32( T, k ) = v128_xor( casti_v128u32( T, k ), + casti_v128u32( U, k ) ); } /* Copy as many bytes as necessary into buf. */ @@ -199,30 +199,30 @@ hmac_sha256_8way_init( hmac_sha256_8way_context *ctx, const void *_K, /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - sha256_8way_init( &ctx->ictx ); - sha256_8way_update( &ctx->ictx, K, Klen ); - sha256_8way_close( &ctx->ictx, khash ); + sha256_8x32_init( &ctx->ictx ); + sha256_8x32_update( &ctx->ictx, K, Klen ); + sha256_8x32_close( &ctx->ictx, khash ); K = khash; Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - sha256_8way_init( &ctx->ictx ); + sha256_8x32_init( &ctx->ictx ); memset( pad, 0x36, 64*8); for ( i = 0; i < Klen/4; i++ ) casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ), casti_m256i( K, i ) ); - sha256_8way_update( &ctx->ictx, pad, 64 ); + sha256_8x32_update( &ctx->ictx, pad, 64 ); /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ - sha256_8way_init( &ctx->octx ); + sha256_8x32_init( &ctx->octx ); memset( pad, 0x5c, 64*8 ); for ( i = 0; i < Klen/4; i++ ) casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ), casti_m256i( K, i ) ); - sha256_8way_update( &ctx->octx, pad, 64 ); + sha256_8x32_update( &ctx->octx, pad, 64 ); } void @@ -230,7 +230,7 @@ hmac_sha256_8way_update( hmac_sha256_8way_context *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. 
*/ - sha256_8way_update( &ctx->ictx, in, len ); + sha256_8x32_update( &ctx->ictx, in, len ); } /* Finish an HMAC-SHA256 operation. */ @@ -240,13 +240,13 @@ hmac_sha256_8way_close( hmac_sha256_8way_context *ctx, void *digest ) unsigned char ihash[32*8] __attribute__ ((aligned (128))); /* Finish the inner SHA256 operation. */ - sha256_8way_close( &ctx->ictx, ihash ); + sha256_8x32_close( &ctx->ictx, ihash ); /* Feed the inner hash to the outer SHA256 operation. */ - sha256_8way_update( &ctx->octx, ihash, 32 ); + sha256_8x32_update( &ctx->octx, ihash, 32 ); /* Finish the outer SHA256 operation. */ - sha256_8way_close( &ctx->octx, digest ); + sha256_8x32_close( &ctx->octx, digest ); } /** @@ -332,21 +332,21 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K, /* If Klen > 64, the key is really SHA256(K). */ if ( Klen > 64 ) { - sha256_16way_init( &ctx->ictx ); - sha256_16way_update( &ctx->ictx, K, Klen ); - sha256_16way_close( &ctx->ictx, khash ); + sha256_16x32_init( &ctx->ictx ); + sha256_16x32_update( &ctx->ictx, K, Klen ); + sha256_16x32_close( &ctx->ictx, khash ); K = khash; Klen = 32; } /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ - sha256_16way_init( &ctx->ictx ); + sha256_16x32_init( &ctx->ictx ); memset( pad, 0x36, 64*16 ); for ( i = 0; i < Klen; i++ ) casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ), casti_m512i( K, i ) ); - sha256_16way_update( &ctx->ictx, pad, 64 ); + sha256_16x32_update( &ctx->ictx, pad, 64 ); /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ sha256_16way_init( &ctx->octx ); @@ -354,7 +354,7 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K, for ( i = 0; i < Klen/4; i++ ) casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ), casti_m512i( K, i ) ); - sha256_16way_update( &ctx->octx, pad, 64 ); + sha256_16x32_update( &ctx->octx, pad, 64 ); } void @@ -362,7 +362,7 @@ hmac_sha256_16way_update( hmac_sha256_16way_context *ctx, const void *in, size_t len ) { /* Feed data to the inner SHA256 operation. */ - sha256_16way_update( &ctx->ictx, in, len ); + sha256_16x32_update( &ctx->ictx, in, len ); } /* Finish an HMAC-SHA256 operation. */ @@ -372,13 +372,13 @@ hmac_sha256_16way_close( hmac_sha256_16way_context *ctx, void *digest ) unsigned char ihash[32*16] __attribute__ ((aligned (128))); /* Finish the inner SHA256 operation. */ - sha256_16way_close( &ctx->ictx, ihash ); + sha256_16x32_close( &ctx->ictx, ihash ); /* Feed the inner hash to the outer SHA256 operation. */ - sha256_16way_update( &ctx->octx, ihash, 32 ); + sha256_16x32_update( &ctx->octx, ihash, 32 ); /* Finish the outer SHA256 operation. */ - sha256_16way_close( &ctx->octx, digest ); + sha256_16x32_close( &ctx->octx, digest ); } /** diff --git a/algo/sha/hmac-sha256-hash-4way.h b/algo/sha/hmac-sha256-hash-4way.h index 320c27b..c4d9853 100644 --- a/algo/sha/hmac-sha256-hash-4way.h +++ b/algo/sha/hmac-sha256-hash-4way.h @@ -1,6 +1,6 @@ /*- * Copyright 2005,2007,2009 Colin Percival - * Copyright 2020 JayDDee@gmailcom + * Copyright 2020 JayDDee246@gmailcom * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -38,11 +38,12 @@ #include "simd-utils.h" #include "sha256-hash.h" -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) + typedef struct _hmac_sha256_4way_context { - sha256_4way_context ictx; - sha256_4way_context octx; + sha256_4x32_context ictx; + sha256_4x32_context octx; } hmac_sha256_4way_context; //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); @@ -67,8 +68,8 @@ void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t, typedef struct _hmac_sha256_8way_context { - sha256_8way_context ictx; - sha256_8way_context octx; + sha256_8x32_context ictx; + sha256_8x32_context octx; } hmac_sha256_8way_context; //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); @@ -88,8 +89,8 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t, typedef struct _hmac_sha256_16way_context { - sha256_16way_context ictx; - sha256_16way_context octx; + sha256_16x32_context ictx; + sha256_16x32_context octx; } hmac_sha256_16way_context; //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] ); diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index bfe4729..a41f6e5 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -30,6 +30,7 @@ static const uint32_t K256[64] = 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 }; +#if defined(__SSE2__) || defined(__ARM_NEON) // SHA-256 4 way SSE2 #define CHs(X, Y, Z) \ @@ -309,142 +310,6 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data, v128_store( state_out + 7, H ); } - -# if 0 - -// Working correctly but still slower -int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data, - const v128_t *state_in, const uint32_t *target ) -{ - v128_t A, B, C, D, E, F, G, H, T0, T1, T2; - v128_t vmask, targ, hash; - int t6_mask, flip; - v128_t W[16]; v128_memcpy( W, data, 16 ); - - A = v128_load( state_in ); - B = v128_load( state_in+1 ); - C = v128_load( state_in+2 ); - D = v128_load( state_in+3 ); - E = v128_load( state_in+4 ); - F = v128_load( state_in+5 ); - G = v128_load( state_in+6 ); - H = v128_load( state_in+7 ); - - const v128_t IV7 = H; - const v128_t IV6 = G; - - SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); - SHA256_4X32_MSG_EXPANSION( W ); - SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); - SHA256_4X32_MSG_EXPANSION( W ); - SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); - - W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); - W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] ); - W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); - W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); - W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); - W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); - W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); - W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); - W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); - W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); - W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] ); - W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] ); - W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] ); - - v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); - - SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 48 ); - SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 48 ); - SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 48 ); - SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, 48 ); - SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 48 ); - 
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, 48 ); - SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, 48 ); - SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, 48 ); - SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, 48 ); - SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, 48 ); - - T0 = v128_add32( v128_32( K256[58] ), - v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) ); - B = v128_add32( B, T0 ); - - T1 = v128_add32( v128_32( K256[59] ), - v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) ); - A = v128_add32( A, T1 ); - - T2 = v128_add32( v128_32( K256[60] ), - v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) ); - H = v128_add32( H, T2 ); - - targ = v128_32( target[7] ); - hash = v128_bswap32( v128_add32( H, IV7 ) ); - - flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash ); - - if ( likely( - 0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) )) - return 0; - - t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) ); - - // round 58 part 2 - F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) ); - - // round 61 part 1 - W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] ); - T0 = v128_add32( v128_32( K256[61] ), - v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) ); - G = v128_add32( G, T0 ); - - if ( t6_mask ) - { - targ = v128_and( vmask, v128_32( target[6] ) ); - hash = v128_bswap32( v128_add32( G, IV6 ) ); - - if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) )) - return 0; - else - { - flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash ); - if ( 0 != ( t6_mask & ( flip ^ v128_movmask32( - v128_cmpgt32( hash, targ ) ) ) ) ) - return 0; - else if ( target[6] == 0x80000000 ) - { - if ( 0 == ( t6_mask & v128_movmask32( - v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) ) - return 0; - } - } - } - - // rounds 59 to 61 part 2 - E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) ); - D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) ); - C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) ); - - // rounds 62 & 63 - W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] ); - W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] ); - - SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 ); - SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 ); - - state_out[0] = v128_add32( state_in[0], A ); - state_out[1] = v128_add32( state_in[1], B ); - state_out[2] = v128_add32( state_in[2], C ); - state_out[3] = v128_add32( state_in[3], D ); - state_out[4] = v128_add32( state_in[4], E ); - state_out[5] = v128_add32( state_in[5], F ); - state_out[6] = v128_add32( state_in[6], G ); - state_out[7] = v128_add32( state_in[7], H ); -return 1; -} - -#endif - void sha256_4x32_init( sha256_4x32_context *sc ) { sc->count_high = sc->count_low = 0; @@ -529,29 +394,31 @@ void sha256_4x32_full( void *dst, const void *data, size_t len ) sha256_4x32_close( &ctx, dst ); } +#endif // SSE2 || NEON + #if defined(__AVX2__) // SHA-256 8 way #define BSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \ - mm256_ror_32( x, 13 ) ), \ - mm256_ror_32( x, 22 ) ) + mm256_xor3( mm256_ror_32( x, 2 ), \ + mm256_ror_32( x, 13 ), \ + mm256_ror_32( x, 22 ) ) #define BSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \ - mm256_ror_32( x, 11 ) ), \ - mm256_ror_32( x, 25 ) ) + mm256_xor3( mm256_ror_32( x, 6 ), \ + mm256_ror_32( x, 11 ), \ + mm256_ror_32( x, 25 ) ) #define SSG2_0x(x) \ - _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \ - mm256_ror_32( x, 18 ) ), \ - 
_mm256_srli_epi32( x, 3 ) ) + mm256_xor3( mm256_ror_32( x, 7 ), \ + mm256_ror_32( x, 18 ), \ + _mm256_srli_epi32( x, 3 ) ) #define SSG2_1x(x) \ - _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \ - mm256_ror_32( x, 19 ) ), \ - _mm256_srli_epi32( x, 10 ) ) + mm256_xor3( mm256_ror_32( x, 17 ), \ + mm256_ror_32( x, 19 ), \ + _mm256_srli_epi32( x, 10 ) ) #define SHA256_8WAY_MEXP( a, b, c, d ) \ mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d ); @@ -574,13 +441,8 @@ void sha256_4x32_full( void *dst, const void *data, size_t len ) W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \ W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); - -// With AVX512VL ternary logic optimizations are available. -// If not optimize by forwarding the result of X^Y in MAJ to the next round -// to avoid recalculating it as Y^Z. This optimization is not applicable -// when MAJ is optimized with ternary logic. - #if defined(VL256) +// AVX512 or AVX10-256 #define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) @@ -745,7 +607,7 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, } // accepts LE input data -void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, +void sha256_8x32_transform_le( __m256i *state_out, const __m256i *data, const __m256i *state_in ) { __m256i W[16]; @@ -754,7 +616,7 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, } // Accepts BE input data, need to bswap -void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, +void sha256_8x32_transform_be( __m256i *state_out, const __m256i *data, const __m256i *state_in ) { __m256i W[16]; @@ -764,7 +626,7 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, } // Aggressive prehashing, LE byte order -void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, +void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X, const __m256i *W, const __m256i *state_in ) { __m256i A, B, C, D, E, F, G, H, T1; @@ -813,7 +675,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, _mm256_store_si256( state_mid + 7, H ); } -void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, +void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data, const __m256i *state_in, const __m256i *state_mid, const __m256i *X ) { __m256i A, B, C, D, E, F, G, H; @@ -914,14 +776,12 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, _mm256_store_si256( state_out + 7, H ); } -int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, +int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i *state_in, const uint32_t *target ) { __m256i A, B, C, D, E, F, G, H, T0, T1, T2; __m256i vmask, targ, hash; __m256i W[16]; memcpy_256( W, data, 16 ); - const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); uint8_t flip, t6_mask; A = _mm256_load_si256( state_in ); @@ -1012,7 +872,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, // Got H, test it. targ = v256_32( target[7] ); - hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf ); + hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) ); if ( target[7] ) { flip = ( (int)target[7] < 0 ? 
-1 : 0 ) ^ mm256_movmask_32( hash ); @@ -1035,7 +895,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, { // Testing H was inconclusive: hash7 == target7, need to test G targ = _mm256_and_si256( vmask, v256_32( target[6] ) ); - hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf ); + hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) ); if ( likely( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpeq_epi32( hash, targ ) ) ) )) @@ -1083,8 +943,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, return 1; } - -void sha256_8way_init( sha256_8way_context *sc ) +void sha256_8x32_init( sha256_8x32_context *sc ) { sc->count_high = sc->count_low = 0; sc->val[0] = v256_32( sha256_iv[0] ); @@ -1100,7 +959,7 @@ void sha256_8way_init( sha256_8way_context *sc ) // need to handle odd byte length for yespower. // Assume only last update is odd. -void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) +void sha256_8x32_update( sha256_8x32_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; size_t ptr; @@ -1121,7 +980,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_8way_transform_be( sc->val, sc->buf, sc->val ); + sha256_8x32_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -1132,7 +991,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) } } -void sha256_8way_close( sha256_8way_context *sc, void *dst ) +void sha256_8x32_close( sha256_8x32_context *sc, void *dst ) { unsigned ptr; uint32_t low, high; @@ -1146,7 +1005,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_8way_transform_be( sc->val, sc->buf, sc->val ); + sha256_8x32_transform_be( sc->val, sc->buf, sc->val ); memset_zero_256( sc->buf, pad >> 2 ); } else @@ -1159,17 +1018,17 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) ); - sha256_8way_transform_be( sc->val, sc->buf, sc->val ); + sha256_8x32_transform_be( sc->val, sc->buf, sc->val ); mm256_block_bswap_32( dst, sc->val ); } -void sha256_8way_full( void *dst, const void *data, size_t len ) +void sha256_8x32_full( void *dst, const void *data, size_t len ) { - sha256_8way_context ctx; - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, data, len ); - sha256_8way_close( &ctx, dst ); + sha256_8x32_context ctx; + sha256_8x32_init( &ctx ); + sha256_8x32_update( &ctx, data, len ); + sha256_8x32_close( &ctx, dst ); } #if defined(SIMD512) @@ -1302,7 +1161,7 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, } // accepts LE input data -void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, +void sha256_16x32_transform_le( __m512i *state_out, const __m512i *data, const __m512i *state_in ) { __m512i W[16]; @@ -1311,7 +1170,7 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, } // Accepts BE input data, need to bswap -void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, +void sha256_16x32_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ) { __m512i W[16]; @@ -1321,7 +1180,7 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, } // Aggressive prehashing, LE byte order -void 
sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, +void sha256_16x32_prehash_3rounds( __m512i *state_mid, __m512i *X, const __m512i *W, const __m512i *state_in ) { __m512i A, B, C, D, E, F, G, H, T1; @@ -1369,7 +1228,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, _mm512_store_si512( state_mid + 7, H ); } -void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, +void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data, const __m512i *state_in, const __m512i *state_mid, const __m512i *X ) { __m512i A, B, C, D, E, F, G, H; @@ -1470,15 +1329,13 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, // returns 0 if hash aborted early and invalid, // returns 1 for completed hash with at least one valid candidate. -int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, +int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data, const __m512i *state_in, const uint32_t *target ) { __m512i A, B, C, D, E, F, G, H, hash, targ; __m512i T0, T1, T2; __m512i W[16]; memcpy_512( W, data, 16 ); __mmask16 t6_mask; - const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); A = _mm512_load_si512( state_in ); B = _mm512_load_si512( state_in+1 ); @@ -1588,7 +1445,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, H = _mm512_add_epi32( H, T2 ); // got H, test it against target[7] - hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf ); + hash = mm512_bswap_32( _mm512_add_epi32( H , IV7 ) ); targ = v512_32( target[7] ); if ( target[7] ) if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) )) @@ -1608,7 +1465,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, // got G, test it against target[6] if indicated if ( (uint16_t)t6_mask ) { - hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf ); + hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) ); targ = v512_32( target[6] ); if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) )) return 0; @@ -1644,7 +1501,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, return 1; } -void sha256_16way_init( sha256_16way_context *sc ) +void sha256_16x32_init( sha256_16x32_context *sc ) { sc->count_high = sc->count_low = 0; sc->val[0] = v512_32( sha256_iv[0] ); @@ -1657,7 +1514,7 @@ void sha256_16way_init( sha256_16way_context *sc ) sc->val[7] = v512_32( sha256_iv[7] ); } -void sha256_16way_update( sha256_16way_context *sc, const void *data, +void sha256_16x32_update( sha256_16x32_context *sc, const void *data, size_t len ) { __m512i *vdata = (__m512i*)data; @@ -1679,7 +1536,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data, len -= clen; if ( ptr == buf_size ) { - sha256_16way_transform_be( sc->val, sc->buf, sc->val ); + sha256_16x32_transform_be( sc->val, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -1690,7 +1547,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data, } } -void sha256_16way_close( sha256_16way_context *sc, void *dst ) +void sha256_16x32_close( sha256_16x32_context *sc, void *dst ) { unsigned ptr; uint32_t low, high; @@ -1704,7 +1561,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) if ( ptr > pad ) { memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_16way_transform_be( sc->val, sc->buf, sc->val ); + sha256_16x32_transform_be( sc->val, sc->buf, sc->val ); 
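/* Annotation, not part of the patch: this close() is the usual SHA-256
 * length padding. If the padding marker spills into the length field area,
 * the current block is hashed first and a fresh block is zeroed; either way
 * the 64-bit message bit count (high word then low word, each bswap_32()'d
 * to big-endian) is stored below at offsets pad and pad+4, one more
 * big-endian transform is run, and the vectorized state is byte-swapped
 * into dst with mm512_block_bswap_32(). */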
memset_zero_512( sc->buf, pad >> 2 ); } else @@ -1717,17 +1574,17 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) ); - sha256_16way_transform_be( sc->val, sc->buf, sc->val ); + sha256_16x32_transform_be( sc->val, sc->buf, sc->val ); mm512_block_bswap_32( dst, sc->val ); } -void sha256_16way_full( void *dst, const void *data, size_t len ) +void sha256_16x32_full( void *dst, const void *data, size_t len ) { - sha256_16way_context ctx; - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, data, len ); - sha256_16way_close( &ctx, dst ); + sha256_16x32_context ctx; + sha256_16x32_init( &ctx ); + sha256_16x32_update( &ctx, data, len ); + sha256_16x32_close( &ctx, dst ); } #undef CH diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index 70c652a..6f1c008 100644 --- a/algo/sha/sha256-hash.h +++ b/algo/sha/sha256-hash.h @@ -180,20 +180,9 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data, int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i *state_in, const uint32_t *target ); -// Temporary API during naming transition -#define sha256_8way_context sha256_8x32_context -#define sha256_8way_init sha256_8x32_init -#define sha256_8way_update sha256_8x32_update -#define sha256_8way_close sha256_8x32_close -#define sha256_8way_full sha256_8x32_full -#define sha256_8way_transform_le sha256_8x32_transform_le -#define sha256_8way_transform_be sha256_8x32_transform_be -#define sha256_8way_prehash_3rounds sha256_8x32_prehash_3rounds -#define sha256_8way_final_rounds sha256_8x32_final_rounds -#define sha256_8way_transform_le_short sha256_8x32_transform_le_short - #endif // AVX2 +#if defined(__SSE2__) || defined(__ARM_NEON) // SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON typedef struct @@ -219,16 +208,5 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data, int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data, const v128_t *state_in, const uint32_t *target ); -// Temporary API during naming transition -#define sha256_4way_context sha256_4x32_context -#define sha256_4way_init sha256_4x32_init -#define sha256_4way_update sha256_4x32_update -#define sha256_4way_close sha256_4x32_close -#define sha256_4way_full sha256_4x32_full -#define sha256_4way_transform_le sha256_4x32_transform_le -#define sha256_4way_transform_be sha256_4x32_transform_be -#define sha256_4way_prehash_3rounds sha256_4x32_prehash_3rounds -#define sha256_4way_final_rounds sha256_4x32_final_rounds -#define sha256_4way_transform_le_short sha256_4x32_transform_le_short - -#endif +#endif // SSE2 || NEON +#endif // SHA256_HASH_H__ diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index b0a2a48..cd4c97a 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -32,8 +32,6 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const v128_t shuf_bswap32 = - v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); // hash first 64 byte block of data sha256_transform_le( mstatea, pdata, sha256_iv ); @@ -69,10 +67,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_v128( hasha, 0 ) = - _mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 ); - casti_v128( hasha, 1 ) = - _mm_shuffle_epi8( casti_v128( hasha, 
1 ), shuf_bswap32 ); + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; @@ -81,10 +77,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, } if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_v128( hashb, 0 ) = - _mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 ); - casti_v128( hashb, 1 ) = - _mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 ); + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; @@ -204,8 +198,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, const int thr_id = mythr->id; const __m512i sixteen = v512_32( 16 ); const bool bench = opt_benchmark; - const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); // prehash first block directly from pdata sha256_transform_le( phash, pdata, sha256_iv ); @@ -231,7 +223,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, buf[15] = v512_32( 80*8 ); // bit count // partially pre-expand & prehash second message block, avoiding the nonces - sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); + sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); // vectorize IV for second hash istate[0] = v512_32( sha256_iv[0] ); @@ -250,15 +242,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, do { - sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre ); - if ( unlikely( sha256_16way_transform_le_short( + sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre ); + if ( unlikely( sha256_16x32_transform_le_short( hash32, block, istate, ptarget ) ) ) { for ( int lane = 0; lane < 16; lane++ ) { extr_lane_16x32( phash, hash32, lane, 256 ); - casti_m256i( phash, 0 ) = - _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); + casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) ); if ( likely( valid_hash( phash, ptarget ) && !bench ) ) { pdata[19] = n + lane; @@ -299,8 +290,6 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i last_byte = v256_32( 0x80000000 ); const __m256i eight = v256_32( 8 ); - const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); for ( int i = 0; i < 19; i++ ) vdata[i] = v256_32( pdata[i] ); @@ -325,22 +314,22 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, istate[6] = v256_32( sha256_iv[6] ); istate[7] = v256_32( sha256_iv[7] ); - sha256_8way_transform_le( mstate1, vdata, istate ); + sha256_8x32_transform_le( mstate1, vdata, istate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 ); + sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 ); do { - sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre ); - if ( unlikely( sha256_8way_transform_le_short( hash32, block, + sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre ); + if ( unlikely( sha256_8x32_transform_le_short( hash32, block, istate, ptarget ) ) ) { for ( int lane = 0; lane < 8; lane++ ) { extr_lane_8x32( lane_hash, hash32, lane, 256 ); casti_m256i( lane_hash, 0 ) 
= - _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf ); + mm256_bswap_32( casti_m256i( lane_hash, 0 ) ); if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) { pdata[19] = n + lane; diff --git a/algo/sha/sha256d.h b/algo/sha/sha256d.h index d20319a..f6f39a0 100644 --- a/algo/sha/sha256d.h +++ b/algo/sha/sha256d.h @@ -12,7 +12,7 @@ #define SHA256D_NEON_SHA2 1 #elif defined(__AVX2__) #define SHA256D_8WAY 1 -#else +#elif defined(__SSE2__) || defined(__ARM_NEON) #define SHA256D_4WAY 1 #endif diff --git a/algo/sha/sha256dt.c b/algo/sha/sha256dt.c index cb67dbd..37824c3 100644 --- a/algo/sha/sha256dt.c +++ b/algo/sha/sha256dt.c @@ -17,7 +17,6 @@ #elif defined (__SSE2__) || defined(__ARM_NEON) #define SHA256DT_4X32 1 #endif -// else ref, should never happen static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) = { @@ -205,8 +204,6 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce, const int thr_id = mythr->id; const __m512i sixteen = v512_32( 16 ); const bool bench = opt_benchmark; - const __m256i bswap_shuf = mm256_bcast_m128( v128_set64( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); // prehash first block directly from pdata sha256_transform_le( phash, pdata, sha256dt_iv ); @@ -258,8 +255,7 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce, for ( int lane = 0; lane < 16; lane++ ) { extr_lane_16x32( phash, hash32, lane, 256 ); - casti_m256i( phash, 0 ) = - _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); + casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) ); if ( likely( valid_hash( phash, ptarget ) && !bench ) ) { pdata[19] = n + lane; @@ -298,8 +294,6 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i last_byte = v256_32( 0x80000000 ); const __m256i eight = v256_32( 8 ); - const __m256i bswap_shuf = mm256_bcast_m128( v128_set64( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); for ( int i = 0; i < 19; i++ ) vdata[i] = v256_32( pdata[i] ); @@ -339,7 +333,7 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce, { extr_lane_8x32( lane_hash, hash32, lane, 256 ); casti_m256i( lane_hash, 0 ) = - _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf ); + mm256_bswap_32( casti_m256i( lane_hash, 0 ) ); if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) { pdata[19] = n + lane; @@ -406,7 +400,6 @@ int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce, do { sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre ); -// sha256_4x32_transform_le( block, vdata+16, mhash1 ); sha256_4x32_transform_le( hash32, block, iv ); for ( int lane = 0; lane < 4; lane++ ) diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index abfe5a1..cbd6e2c 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -7,28 +7,28 @@ #if defined(SHA256T_16WAY) -static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64))); +static __thread sha256_16x32_context sha256_ctx16 __attribute__ ((aligned (64))); void sha256q_16way_hash( void* output, const void* input ) { uint32_t vhash[8*16] __attribute__ ((aligned (64))); - sha256_16way_context ctx; + sha256_16x32_context ctx; memcpy( &ctx, &sha256_ctx16, sizeof ctx ); - sha256_16way_update( &ctx, input + (64<<4), 16 ); - sha256_16way_close( &ctx, vhash ); + sha256_16x32_update( &ctx, input + (64<<4), 16 ); + sha256_16x32_close( &ctx, vhash ); - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 
32 ); - sha256_16way_close( &ctx, vhash ); + sha256_16x32_init( &ctx ); + sha256_16x32_update( &ctx, vhash, 32 ); + sha256_16x32_close( &ctx, vhash ); - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 32 ); - sha256_16way_close( &ctx, vhash ); + sha256_16x32_init( &ctx ); + sha256_16x32_update( &ctx, vhash, 32 ); + sha256_16x32_close( &ctx, vhash ); - sha256_16way_init( &ctx ); - sha256_16way_update( &ctx, vhash, 32 ); - sha256_16way_close( &ctx, output ); + sha256_16x32_init( &ctx ); + sha256_16x32_update( &ctx, vhash, 32 ); + sha256_16x32_close( &ctx, output ); } int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce, @@ -51,8 +51,8 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce, mm512_bswap32_intrlv80_16x32( vdata, pdata ); *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - sha256_16way_init( &sha256_ctx16 ); - sha256_16way_update( &sha256_ctx16, vdata, 64 ); + sha256_16x32_init( &sha256_ctx16 ); + sha256_16x32_update( &sha256_ctx16, vdata, 64 ); do { @@ -80,28 +80,28 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce, #if defined(SHA256T_8WAY) -static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64))); +static __thread sha256_8x32_context sha256_ctx8 __attribute__ ((aligned (64))); void sha256q_8way_hash( void* output, const void* input ) { uint32_t vhash[8*8] __attribute__ ((aligned (64))); - sha256_8way_context ctx; + sha256_8x32_context ctx; memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - sha256_8way_update( &ctx, input + (64<<3), 16 ); - sha256_8way_close( &ctx, vhash ); + sha256_8x32_update( &ctx, input + (64<<3), 16 ); + sha256_8x32_close( &ctx, vhash ); - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, vhash ); + sha256_8x32_init( &ctx ); + sha256_8x32_update( &ctx, vhash, 32 ); + sha256_8x32_close( &ctx, vhash ); - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, vhash ); + sha256_8x32_init( &ctx ); + sha256_8x32_update( &ctx, vhash, 32 ); + sha256_8x32_close( &ctx, vhash ); - sha256_8way_init( &ctx ); - sha256_8way_update( &ctx, vhash, 32 ); - sha256_8way_close( &ctx, output ); + sha256_8x32_init( &ctx ); + sha256_8x32_update( &ctx, vhash, 32 ); + sha256_8x32_close( &ctx, output ); } int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce, @@ -123,8 +123,8 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce, mm256_bswap32_intrlv80_8x32( vdata, pdata ); *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ); - sha256_8way_init( &sha256_ctx8 ); - sha256_8way_update( &sha256_ctx8, vdata, 64 ); + sha256_8x32_init( &sha256_ctx8 ); + sha256_8x32_update( &sha256_ctx8, vdata, 64 ); do { @@ -152,28 +152,28 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce, #if defined(SHA256T_4WAY) -static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64))); +static __thread sha256_4x32_context sha256_ctx4 __attribute__ ((aligned (64))); void sha256q_4way_hash( void* output, const void* input ) { uint32_t vhash[8*4] __attribute__ ((aligned (64))); - sha256_4way_context ctx; + sha256_4x32_context ctx; memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - sha256_4way_update( &ctx, input + (64<<2), 16 ); - sha256_4way_close( &ctx, vhash ); + sha256_4x32_update( &ctx, input + (64<<2), 16 ); + sha256_4x32_close( &ctx, vhash ); - sha256_4way_init( &ctx ); - 
sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, vhash ); + sha256_4x32_init( &ctx ); + sha256_4x32_update( &ctx, vhash, 32 ); + sha256_4x32_close( &ctx, vhash ); - sha256_4way_init( &ctx ); - sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, vhash ); + sha256_4x32_init( &ctx ); + sha256_4x32_update( &ctx, vhash, 32 ); + sha256_4x32_close( &ctx, vhash ); - sha256_4way_init( &ctx ); - sha256_4way_update( &ctx, vhash, 32 ); - sha256_4way_close( &ctx, output ); + sha256_4x32_init( &ctx ); + sha256_4x32_update( &ctx, vhash, 32 ); + sha256_4x32_close( &ctx, output ); } int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, @@ -205,8 +205,8 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, 0 }; v128_bswap32_intrlv80_4x32( vdata, pdata ); - sha256_4way_init( &sha256_ctx4 ); - sha256_4way_update( &sha256_ctx4, vdata, 64 ); + sha256_4x32_init( &sha256_ctx4 ); + sha256_4x32_update( &sha256_ctx4, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 9b21b06..4357e6b 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -35,8 +35,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, const int thr_id = mythr->id; const __m512i sixteen = v512_32( 16 ); const bool bench = opt_benchmark; - const __m256i bswap_shuf = mm256_bcast_m128( v128_set64( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); // prehash first block directly from pdata sha256_transform_le( phash, pdata, sha256_iv ); @@ -62,7 +60,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, buf[15] = v512_32( 80*8 ); // bit count // partially pre-expand & prehash second message block, avoiding the nonces - sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); + sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 ); // vectorize IV for 2nd & 3rd sha256 istate[0] = v512_32( sha256_iv[0] ); @@ -81,18 +79,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, do { - sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre ); + sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre ); - sha256_16way_transform_le( block, block, istate ); + sha256_16x32_transform_le( block, block, istate ); - if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) ) + if ( sha256_16x32_transform_le_short( hash32, block, istate, ptarget ) ) { for ( int lane = 0; lane < 16; lane++ ) if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 ) { extr_lane_16x32( phash, hash32, lane, 256 ); - casti_m256i( phash, 0 ) = - _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); + casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) ); if ( likely( valid_hash( phash, ptarget ) && !bench ) ) { pdata[19] = n + lane; @@ -301,8 +298,6 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i last_byte = v256_32( 0x80000000 ); const __m256i eight = v256_32( 8 ); - const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); for ( int i = 0; i < 19; i++ ) vdata[i] = v256_32( pdata[i] ); @@ -327,29 +322,29 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, istate[6] = v256_32( sha256_iv[6] ); istate[7] = v256_32( sha256_iv[7] ); - sha256_8way_transform_le( mstate1, vdata, istate ); + sha256_8x32_transform_le( mstate1, vdata, istate ); // Do 3 rounds on the first 
12 bytes of the next block - sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 ); + sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 ); do { // 1. final 16 bytes of data, with padding - sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, + sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre ); // 2. 32 byte hash from 1. - sha256_8way_transform_le( block, block, istate ); + sha256_8x32_transform_le( block, block, istate ); // 3. 32 byte hash from 2. - if ( unlikely( sha256_8way_transform_le_short( + if ( unlikely( sha256_8x32_transform_le_short( hash32, block, istate, ptarget ) ) ) { for ( int lane = 0; lane < 8; lane++ ) { extr_lane_8x32( lane_hash, hash32, lane, 256 ); casti_m256i( lane_hash, 0 ) = - _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf ); + mm256_bswap_32( casti_m256i( lane_hash, 0 ) ); if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) { pdata[19] = n + lane; @@ -419,8 +414,8 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, do { sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre ); - sha256_4way_transform_le( block, block, iv ); - sha256_4way_transform_le( hash32, block, iv ); + sha256_4x32_transform_le( block, block, iv ); + sha256_4x32_transform_le( hash32, block, iv ); for ( int lane = 0; lane < 4; lane++ ) { diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 0f51be7..10fefa8 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -83,15 +83,13 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, const uint64_t *state_in ) { __m256i STATE0, STATE1; - __m256i MSG, TMP, BSWAP64; + __m256i MSG, TMP; __m256i TMSG0, TMSG1, TMSG2, TMSG3; __m256i ABEF_SAVE, CDGH_SAVE; // Load initial values TMP = _mm256_load_si256( (__m256i*) &state_in[0] ); STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] ); - BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, - 0x0001020304050607 ) ); TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF @@ -103,7 +101,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, // Rounds 0-3 TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) ); - TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 ); + TMSG0 = mm256_bswap_64( TMSG0 ); MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) ); STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128 (MSG ) ); @@ -113,7 +111,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, // Rounds 4-7 TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) ); - TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 ); + TMSG1 = mm256_bswap_64( TMSG1 ); MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) ); STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); @@ -124,7 +122,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, // Rounds 8-11 TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) ); - TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 ); + TMSG2 = mm256_bswap_64( TMSG2 ); MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) ); STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); @@ -135,7 +133,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input, // Rounds 12-15 TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) ); - TMSG3 = 
_mm256_shuffle_epi8( TMSG3, BSWAP64 ); + TMSG3 = mm256_bswap_64( TMSG3 ); MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) ); STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, _mm256_castsi256_si128( MSG ) ); @@ -735,8 +733,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst ) unsigned ptr; const int buf_size = 128; const int pad = buf_size - 16; - const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x( - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); ptr = (unsigned)sc->count & (buf_size - 1U); sc->buf[ ptr>>3 ] = v512_64( 0x80 ); @@ -750,10 +746,8 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst ) else memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); - sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8( - v512_64( sc->count >> 61 ), shuff_bswap64 ); - sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8( - v512_64( sc->count << 3 ), shuff_bswap64 ); + sc->buf[ pad >> 3 ] = v512_64( bswap_64( sc->count >> 61 ) ); + sc->buf[ ( pad+8 ) >> 3 ] = v512_64( bswap_64( sc->count << 3 ) ); sha512_8x64_round( sc, sc->buf, sc->val ); mm512_block_bswap_64( dst, sc->val ); @@ -957,8 +951,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst ) unsigned ptr; const int buf_size = 128; const int pad = buf_size - 16; - const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x( - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); ptr = (unsigned)sc->count & (buf_size - 1U); sc->buf[ ptr>>3 ] = v256_64( 0x80 ); @@ -972,10 +964,8 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst ) else memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); - sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8( - v256_64( sc->count >> 61 ), shuff_bswap64 ); - sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8( - v256_64( sc->count << 3 ), shuff_bswap64 ); + sc->buf[ pad >> 3 ] = v256_64( bswap_64( sc->count >> 61 ) ); + sc->buf[ ( pad+8 ) >> 3 ] = v256_64( bswap_64( sc->count << 3 ) ); sha512_4x64_round( sc, sc->buf, sc->val ); mm256_block_bswap_64( dst, sc->val ); @@ -1138,8 +1128,8 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst ) else v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); - sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) ); - sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) ); + sc->buf[ pad >> 3 ] = v128_64( bswap_64( sc->count >> 61 ) ); + sc->buf[ ( pad+8 ) >> 3 ] = v128_64( bswap_64( sc->count << 3 ) ); sha512_2x64_round( sc, sc->buf, sc->val ); v128_block_bswap64( castp_v128u64( dst ), sc->val ); diff --git a/algo/sha/sha512-hash.h b/algo/sha/sha512-hash.h index 0f8cda9..ffd6535 100644 --- a/algo/sha/sha512-hash.h +++ b/algo/sha/sha512-hash.h @@ -36,7 +36,6 @@ typedef struct uint64_t count; bool initialized; } sha512_8x64_context __attribute__ ((aligned (128))); -#define sha512_8way_context sha512_8x64_context void sha512_8x64_init( sha512_8x64_context *sc); void sha512_8x64_update( sha512_8x64_context *sc, const void *data, @@ -45,10 +44,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst ); void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data, size_t len ); -#define sha512_8way_init sha512_8x64_init -#define sha512_8way_update sha512_8x64_update -#define sha512_8way_close sha512_8x64_close - #endif // AVX512 #if defined (__AVX2__) @@ -62,7 +57,6 @@ typedef struct uint64_t count; bool initialized; } sha512_4x64_context __attribute__ ((aligned (64))); -#define sha512_4way_context sha512_4x64_context void sha512_4x64_init( sha512_4x64_context *sc); void sha512_4x64_update( 
sha512_4x64_context *sc, const void *data, @@ -71,10 +65,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst ); void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data, size_t len ); -#define sha512_4way_init sha512_4x64_init -#define sha512_4way_update sha512_4x64_update -#define sha512_4way_close sha512_4x64_close - #endif // AVX2 typedef struct diff --git a/algo/sha/sha512256d-4way.c b/algo/sha/sha512256d-4way.c index 62edcac..c350eb6 100644 --- a/algo/sha/sha512256d-4way.c +++ b/algo/sha/sha512256d-4way.c @@ -14,7 +14,7 @@ #if defined(SHA512256D_8WAY) -static void sha512256d_8way_init( sha512_8way_context *ctx ) +static void sha512256d_8x64_init( sha512_8x64_context *ctx ) { ctx->count = 0; ctx->initialized = true; @@ -33,7 +33,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce, { uint64_t hash[8*8] __attribute__ ((aligned (128))); uint32_t vdata[20*8] __attribute__ ((aligned (64))); - sha512_8way_context ctx; + sha512_8x64_context ctx; uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint64_t *hash_q3 = &(hash[3*8]); uint32_t *pdata = work->data; @@ -53,13 +53,13 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); do { - sha512256d_8way_init( &ctx ); - sha512_8way_update( &ctx, vdata, 80 ); - sha512_8way_close( &ctx, hash ); + sha512256d_8x64_init( &ctx ); + sha512_8x64_update( &ctx, vdata, 80 ); + sha512_8x64_close( &ctx, hash ); - sha512256d_8way_init( &ctx ); - sha512_8way_update( &ctx, hash, 32 ); - sha512_8way_close( &ctx, hash ); + sha512256d_8x64_init( &ctx ); + sha512_8x64_update( &ctx, hash, 32 ); + sha512_8x64_close( &ctx, hash ); for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) ) @@ -82,7 +82,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce, #elif defined(SHA512256D_4WAY) -static void sha512256d_4way_init( sha512_4way_context *ctx ) +static void sha512256d_4x64_init( sha512_4x64_context *ctx ) { ctx->count = 0; ctx->initialized = true; @@ -101,7 +101,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce, { uint64_t hash[8*4] __attribute__ ((aligned (64))); uint32_t vdata[20*4] __attribute__ ((aligned (64))); - sha512_4way_context ctx; + sha512_4x64_context ctx; uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint64_t *hash_q3 = &(hash[3*4]); uint32_t *pdata = work->data; @@ -119,13 +119,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce, n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) ); do { - sha512256d_4way_init( &ctx ); - sha512_4way_update( &ctx, vdata, 80 ); - sha512_4way_close( &ctx, hash ); + sha512256d_4x64_init( &ctx ); + sha512_4x64_update( &ctx, vdata, 80 ); + sha512_4x64_close( &ctx, hash ); - sha512256d_4way_init( &ctx ); - sha512_4way_update( &ctx, hash, 32 ); - sha512_4way_close( &ctx, hash ); + sha512256d_4x64_init( &ctx ); + sha512_4x64_update( &ctx, hash, 32 ); + sha512_4x64_close( &ctx, hash ); for ( int lane = 0; lane < 4; lane++ ) if ( hash_q3[ lane ] <= targ_q3 ) diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 26b1ccd..c149a56 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -430,9 +430,9 @@ do { \ } while (0) static void -shabal_16way_init( void *cc, unsigned size ) +shabal_16x32_init( void *cc, unsigned size ) { - shabal_16way_context *sc = (shabal_16way_context*)cc; + shabal_16x32_context *sc = (shabal_16x32_context*)cc; if ( size == 512 ) 
{ // copy immediate constants directly to working registers later. @@ -494,9 +494,9 @@ shabal_16way_init( void *cc, unsigned size ) } static void -shabal_16way_core( void *cc, const unsigned char *data, size_t len ) +shabal_16x32_core( void *cc, const unsigned char *data, size_t len ) { - shabal_16way_context *sc = (shabal_16way_context*)cc; + shabal_16x32_context *sc = (shabal_16x32_context*)cc; __m512i *buf; __m512i *vdata = (__m512i*)data; const int buf_size = 64; @@ -544,10 +544,10 @@ shabal_16way_core( void *cc, const unsigned char *data, size_t len ) } static void -shabal_16way_close( void *cc, unsigned ub, unsigned n, void *dst, +shabal_16x32_close( void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words ) { - shabal_16way_context *sc = (shabal_16way_context*)cc; + shabal_16x32_context *sc = (shabal_16x32_context*)cc; __m512i *buf; const int buf_size = 64; size_t ptr; @@ -590,52 +590,39 @@ shabal_16way_close( void *cc, unsigned ub, unsigned n, void *dst, } void -shabal256_16way_init( void *cc ) +shabal256_16x32_init( void *cc ) { - shabal_16way_init(cc, 256); + shabal_16x32_init(cc, 256); } void -shabal256_16way_update( void *cc, const void *data, size_t len ) +shabal256_16x32_update( void *cc, const void *data, size_t len ) { - shabal_16way_core( cc, data, len ); + shabal_16x32_core( cc, data, len ); } void -shabal256_16way_close( void *cc, void *dst ) +shabal256_16x32_close( void *cc, void *dst ) { - shabal_16way_close(cc, 0, 0, dst, 8); + shabal_16x32_close(cc, 0, 0, dst, 8); } void -shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ) +shabal512_16x32_init(void *cc) { - shabal_16way_close(cc, ub, n, dst, 8); + shabal_16x32_init(cc, 512); } void -shabal512_16way_init(void *cc) +shabal512_16x32_update(void *cc, const void *data, size_t len) { - shabal_16way_init(cc, 512); + shabal_16x32_core(cc, data, len); } void -shabal512_16way_update(void *cc, const void *data, size_t len) +shabal512_16x32_close(void *cc, void *dst) { - shabal_16way_core(cc, data, len); -} - -void -shabal512_16way_close(void *cc, void *dst) -{ - shabal_16way_close(cc, 0, 0, dst, 16); -} - -void -shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - shabal_16way_close(cc, ub, n, dst, 16); + shabal_16x32_close(cc, 0, 0, dst, 16); } #endif @@ -1031,9 +1018,9 @@ do { \ } while (0) static void -shabal_8way_init( void *cc, unsigned size ) +shabal_8x32_init( void *cc, unsigned size ) { - shabal_8way_context *sc = (shabal_8way_context*)cc; + shabal_8x32_context *sc = (shabal_8x32_context*)cc; if ( size == 512 ) { // copy immediate constants directly to working registers later. 
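Editorial note, not part of the patch: the rename scheme in this commit replaces the ambiguous "_Nway" suffix with "_NxW", spelling out lane count and element width, so shabal_16way becomes shabal_16x32 (sixteen 32-bit lanes). A minimal sketch of a hypothetical caller (the helper name is illustrative only) using the renamed 16-lane Shabal-512 API declared in shabal-hash-4way.h:

static void shabal512_16x32_example( void *vhash, const void *vdata )
{
   shabal512_16x32_context ctx;               // was shabal512_16way_context
   shabal512_16x32_init( &ctx );              // was shabal512_16way_init
   shabal512_16x32_update( &ctx, vdata, 64 ); // was shabal512_16way_update, 64 bytes per lane here
   shabal512_16x32_close( &ctx, vhash );      // was shabal512_16way_close
}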
@@ -1095,9 +1082,9 @@ shabal_8way_init( void *cc, unsigned size ) } static void -shabal_8way_core( void *cc, const unsigned char *data, size_t len ) +shabal_8x32_core( void *cc, const unsigned char *data, size_t len ) { - shabal_8way_context *sc = (shabal_8way_context*)cc; + shabal_8x32_context *sc = (shabal_8x32_context*)cc; __m256i *buf; __m256i *vdata = (__m256i*)data; const int buf_size = 64; @@ -1146,10 +1133,10 @@ shabal_8way_core( void *cc, const unsigned char *data, size_t len ) } static void -shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst, +shabal_8x32_close( void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words ) { - shabal_8way_context *sc = (shabal_8way_context*)cc; + shabal_8x32_context *sc = (shabal_8x32_context*)cc; __m256i *buf; const int buf_size = 64; size_t ptr; @@ -1192,52 +1179,39 @@ shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst, } void -shabal256_8way_init( void *cc ) +shabal256_8x32_init( void *cc ) { - shabal_8way_init(cc, 256); + shabal_8x32_init(cc, 256); } void -shabal256_8way_update( void *cc, const void *data, size_t len ) +shabal256_8x32_update( void *cc, const void *data, size_t len ) { - shabal_8way_core( cc, data, len ); + shabal_8x32_core( cc, data, len ); } void -shabal256_8way_close( void *cc, void *dst ) +shabal256_8x32_close( void *cc, void *dst ) { - shabal_8way_close(cc, 0, 0, dst, 8); + shabal_8x32_close(cc, 0, 0, dst, 8); } void -shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ) +shabal512_8x32_init(void *cc) { - shabal_8way_close(cc, ub, n, dst, 8); + shabal_8x32_init(cc, 512); } void -shabal512_8way_init(void *cc) +shabal512_8x32_update(void *cc, const void *data, size_t len) { - shabal_8way_init(cc, 512); + shabal_8x32_core(cc, data, len); } void -shabal512_8way_update(void *cc, const void *data, size_t len) +shabal512_8x32_close(void *cc, void *dst) { - shabal_8way_core(cc, data, len); -} - -void -shabal512_8way_close(void *cc, void *dst) -{ - shabal_8way_close(cc, 0, 0, dst, 16); -} - -void -shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - shabal_8way_close(cc, ub, n, dst, 16); + shabal_8x32_close(cc, 0, 0, dst, 16); } #endif // AVX2 @@ -1674,9 +1648,9 @@ static const sph_u32 C_init_512[] = { */ static void -shabal_4way_init( void *cc, unsigned size ) +shabal_4x32_init( void *cc, unsigned size ) { - shabal_4way_context *sc = (shabal_4way_context*)cc; + shabal_4x32_context *sc = (shabal_4x32_context*)cc; if ( size == 512 ) { // copy immediate constants directly to working registers later. 
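Editorial note, not part of the patch: the header change further below keeps the old _8way names as plain #define aliases for the new _8x32 names, so existing call sites compile unchanged; only the *_addbits_and_close variants are removed without a replacement. A sketch of the effect, assuming the aliases shown in shabal-hash-4way.h (the wrapper name is illustrative only):

static void legacy_shabal256_8way_call( void *hash, const void *data, size_t len )
{
   shabal256_8way_context ctx;               // expands to shabal256_8x32_context
   shabal256_8way_init( &ctx );              // expands to shabal256_8x32_init
   shabal256_8way_update( &ctx, data, len ); // expands to shabal256_8x32_update
   shabal256_8way_close( &ctx, hash );       // expands to shabal256_8x32_close
}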
@@ -1786,9 +1760,9 @@ shabal_4way_init( void *cc, unsigned size ) } static void -shabal_4way_core( void *cc, const unsigned char *data, size_t len ) +shabal_4x32_core( void *cc, const unsigned char *data, size_t len ) { - shabal_4way_context *sc = (shabal_4way_context*)cc; + shabal_4x32_context *sc = (shabal_4x32_context*)cc; v128_t *buf; v128_t *vdata = (v128_t*)data; const int buf_size = 64; @@ -1838,10 +1812,10 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len ) } static void -shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, +shabal_4x32_close( void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words ) { - shabal_4way_context *sc = (shabal_4way_context*)cc; + shabal_4x32_context *sc = (shabal_4x32_context*)cc; v128_t *buf; const int buf_size = 64; size_t ptr; @@ -1884,52 +1858,39 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, } void -shabal256_4way_init( void *cc ) +shabal256_4x32_init( void *cc ) { - shabal_4way_init(cc, 256); + shabal_4x32_init(cc, 256); } void -shabal256_4way_update( void *cc, const void *data, size_t len ) +shabal256_4x32_update( void *cc, const void *data, size_t len ) { - shabal_4way_core( cc, data, len ); + shabal_4x32_core( cc, data, len ); } void -shabal256_4way_close( void *cc, void *dst ) +shabal256_4x32_close( void *cc, void *dst ) { - shabal_4way_close(cc, 0, 0, dst, 8); + shabal_4x32_close(cc, 0, 0, dst, 8); } void -shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ) +shabal512_4x32_init(void *cc) { - shabal_4way_close(cc, ub, n, dst, 8); + shabal_4x32_init(cc, 512); } void -shabal512_4way_init(void *cc) +shabal512_4x32_update(void *cc, const void *data, size_t len) { - shabal_4way_init(cc, 512); + shabal_4x32_core(cc, data, len); } void -shabal512_4way_update(void *cc, const void *data, size_t len) +shabal512_4x32_close(void *cc, void *dst) { - shabal_4way_core(cc, data, len); -} - -void -shabal512_4way_close(void *cc, void *dst) -{ - shabal_4way_close(cc, 0, 0, dst, 16); -} - -void -shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - shabal_4way_close(cc, ub, n, dst, 16); + shabal_4x32_close(cc, 0, 0, dst, 16); } #endif diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index fa51e3d..253155f 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -4,10 +4,6 @@ #include #include "simd-utils.h" -#define SPH_SIZE_shabal256 256 - -#define SPH_SIZE_shabal512 512 - #if defined(SIMD512) typedef struct { @@ -16,22 +12,27 @@ typedef struct { uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; -} shabal_16way_context __attribute__ ((aligned (64))); +} shabal_16x32_context __attribute__ ((aligned (64))); -typedef shabal_16way_context shabal256_16way_context; -typedef shabal_16way_context shabal512_16way_context; +typedef shabal_16x32_context shabal256_16x32_context; +typedef shabal_16x32_context shabal512_16x32_context; -void shabal256_16way_init( void *cc ); -void shabal256_16way_update( void *cc, const void *data, size_t len ); -void shabal256_16way_close( void *cc, void *dst ); -void shabal256_16way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); +void shabal256_16x32_init( void *cc ); +void shabal256_16x32_update( void *cc, const void *data, size_t len ); +void shabal256_16x32_close( void *cc, void *dst ); -void shabal512_16way_init( void *cc ); -void shabal512_16way_update( void *cc, const void *data, size_t len ); -void shabal512_16way_close( void *cc, void 
*dst ); -void shabal512_16way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); +void shabal512_16x32_init( void *cc ); +void shabal512_16x32_update( void *cc, const void *data, size_t len ); +void shabal512_16x32_close( void *cc, void *dst ); + +#define shabal256_16way_context shabal256_16x32_context +#define shabal256_16way_init shabal256_16x32_init +#define shabal256_16way_update shabal256_16x32_update +#define shabal256_16way_close shabal256_16x32_close +#define shabal512_16way_context shabal512_16x32_context +#define shabal512_16way_init shabal512_16x32_init +#define shabal512_16way_update shabal512_16x32_update +#define shabal512_16way_close shabal512_16x32_close #endif @@ -43,22 +44,27 @@ typedef struct { uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; -} shabal_8way_context __attribute__ ((aligned (64))); +} shabal_8x32_context __attribute__ ((aligned (64))); -typedef shabal_8way_context shabal256_8way_context; -typedef shabal_8way_context shabal512_8way_context; +typedef shabal_8x32_context shabal256_8x32_context; +typedef shabal_8x32_context shabal512_8x32_context; -void shabal256_8way_init( void *cc ); -void shabal256_8way_update( void *cc, const void *data, size_t len ); -void shabal256_8way_close( void *cc, void *dst ); -void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); +void shabal256_8x32_init( void *cc ); +void shabal256_8x32_update( void *cc, const void *data, size_t len ); +void shabal256_8x32_close( void *cc, void *dst ); -void shabal512_8way_init( void *cc ); -void shabal512_8way_update( void *cc, const void *data, size_t len ); -void shabal512_8way_close( void *cc, void *dst ); -void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); +void shabal512_8x32_init( void *cc ); +void shabal512_8x32_update( void *cc, const void *data, size_t len ); +void shabal512_8x32_close( void *cc, void *dst ); + +#define shabal256_8way_context shabal256_8x32_context +#define shabal256_8way_init shabal256_8x32_init +#define shabal256_8way_update shabal256_8x32_update +#define shabal256_8way_close shabal256_8x32_close +#define shabal512_8way_context shabal512_8x32_context +#define shabal512_8way_init shabal512_8x32_init +#define shabal512_8way_update shabal512_8x32_update +#define shabal512_8way_close shabal512_8x32_close #endif @@ -70,59 +76,29 @@ typedef struct { uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; -} shabal_4way_context; +} shabal_4x32_context; -typedef shabal_4way_context shabal256_4way_context; -typedef shabal_4way_context shabal512_4way_context; +typedef shabal_4x32_context shabal256_4x32_context; +typedef shabal_4x32_context shabal512_4x32_context; -void shabal256_4way_init( void *cc ); -void shabal256_4way_update( void *cc, const void *data, size_t len ); -void shabal256_4way_close( void *cc, void *dst ); -void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); +void shabal256_4x32_init( void *cc ); +void shabal256_4x32_update( void *cc, const void *data, size_t len ); +void shabal256_4x32_close( void *cc, void *dst ); -void shabal512_4way_init( void *cc ); -void shabal512_4way_update( void *cc, const void *data, size_t len ); -void shabal512_4way_close( void *cc, void *dst ); -void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); +void shabal512_4x32_init( void *cc ); +void shabal512_4x32_update( void *cc, const void *data, size_t len ); +void shabal512_4x32_close( void *cc, void *dst ); + +#define 
shabal256_4way_context shabal256_4x32_context +#define shabal256_4way_init shabal256_4x32_init +#define shabal256_4way_update shabal256_4x32_update +#define shabal256_4way_close shabal256_4x32_close +#define shabal512_4way_context shabal512_4x32_context +#define shabal512_4way_init shabal512_4x32_init +#define shabal512_4way_update shabal512_4x32_update +#define shabal512_4way_close shabal512_4x32_close #endif -// SSE or NEON - -/* No __mullo_pi32 - -typedef struct -{ - v64_t buf[16] __attribute__ ((aligned (64))); - v64_t A[12], B[16], C[16]; - uint32_t Whigh, Wlow; - size_t ptr; - bool state_loaded; -} shabal_2x32_context; - -typedef shabal_2x32_context shabal256_2x32_context; -typedef shabal_2x32_context shabal512_2x32_context; - -void shabal256_2x32_init( void *cc ); -void shabal256_2x32_update( void *cc, const void *data, size_t len ); -void shabal256_2x32_close( void *cc, void *dst ); -void shabal256_2x32_addbits_and_close( void *cc, unsigned ub, unsigned n, - void *dst ); - -void shabal512_2x32_init( shabal512_2x32_context *cc ); -void shabal512_2x32_update( shabal512_2x32_context *cc, const void *data, - size_t len ); -void shabal512_2x32_close( shabal512_2x32_context *cc, void *dst ); -void shabal512_2x32_addbits_and_close( shabal512_2x32_context *cc, - unsigned ub, unsigned n, void *dst ); -void shabal512_2x32_ctx( shabal512_2x32_context *cc, void *dst, - const void *data, size_t len ); -void shabal512_2x32( shabal512_2x32_context *dst, const void *data, - size_t len ); - -*/ - #endif diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index 4c0be16..08b3fbf 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -6,23 +6,23 @@ #if defined (SKEIN_8WAY) -static __thread skein512_8way_context skein512_8way_ctx +static __thread skein512_8x64_context skein512_8x64_ctx __attribute__ ((aligned (64))); void skeinhash_8way( void *state, const void *input ) { uint64_t vhash64[8*8] __attribute__ ((aligned (128))); - skein512_8way_context ctx_skein; - memcpy( &ctx_skein, &skein512_8way_ctx, sizeof( ctx_skein ) ); + skein512_8x64_context ctx_skein; + memcpy( &ctx_skein, &skein512_8x64_ctx, sizeof( ctx_skein ) ); uint32_t vhash32[16*8] __attribute__ ((aligned (128))); - sha256_8way_context ctx_sha256; + sha256_8x32_context ctx_sha256; - skein512_8way_final16( &ctx_skein, vhash64, input + (64*8) ); + skein512_8x64_final16( &ctx_skein, vhash64, input + (64*8) ); rintrlv_8x64_8x32( vhash32, vhash64, 512 ); - sha256_8way_init( &ctx_sha256 ); - sha256_8way_update( &ctx_sha256, vhash32, 64 ); - sha256_8way_close( &ctx_sha256, state ); + sha256_8x32_init( &ctx_sha256 ); + sha256_8x32_update( &ctx_sha256, vhash32, 64 ); + sha256_8x32_close( &ctx_sha256, state ); } int scanhash_skein_8way( struct work *work, uint32_t max_nonce, @@ -46,7 +46,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce, *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); - skein512_8way_prehash64( &skein512_8way_ctx, vdata ); + skein512_8x64_prehash64( &skein512_8x64_ctx, vdata ); do { skeinhash_8way( hash, vdata ); @@ -73,14 +73,14 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce, #elif defined (SKEIN_4WAY) -static __thread skein512_4way_context skein512_4way_ctx +static __thread skein512_4x64_context skein512_4x64_ctx __attribute__ ((aligned (64))); void skeinhash_4way( void *state, const void *input ) { uint64_t vhash64[8*4] __attribute__ ((aligned (128))); - skein512_4way_context ctx_skein; - 
memcpy( &ctx_skein, &skein512_4way_ctx, sizeof( ctx_skein ) ); + skein512_4x64_context ctx_skein; + memcpy( &ctx_skein, &skein512_4x64_ctx, sizeof( ctx_skein ) ); #if defined(__SHA__) uint32_t hash0[16] __attribute__ ((aligned (64))); uint32_t hash1[16] __attribute__ ((aligned (64))); @@ -88,10 +88,10 @@ void skeinhash_4way( void *state, const void *input ) uint32_t hash3[16] __attribute__ ((aligned (64))); #else uint32_t vhash32[16*4] __attribute__ ((aligned (64))); - sha256_4way_context ctx_sha256; + sha256_4x32_context ctx_sha256; #endif - skein512_4way_final16( &ctx_skein, vhash64, input + (64*4) ); + skein512_4x64_final16( &ctx_skein, vhash64, input + (64*4) ); #if defined(__SHA__) @@ -107,9 +107,9 @@ void skeinhash_4way( void *state, const void *input ) #else rintrlv_4x64_4x32( vhash32, vhash64, 512 ); - sha256_4way_init( &ctx_sha256 ); - sha256_4way_update( &ctx_sha256, vhash32, 64 ); - sha256_4way_close( &ctx_sha256, state ); + sha256_4x32_init( &ctx_sha256 ); + sha256_4x32_update( &ctx_sha256, vhash32, 64 ); + sha256_4x32_close( &ctx_sha256, state ); #endif } @@ -132,7 +132,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_prehash64( &skein512_4way_ctx, vdata ); + skein512_4x64_prehash64( &skein512_4x64_ctx, vdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 33677eb..33f3c54 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -513,7 +513,7 @@ do { \ #if defined(SIMD512) -void skein256_8way_init( skein256_8way_context *sc ) +void skein256_8x64_init( skein256_8x64_context *sc ) { sc->h0 = _mm512_set1_epi64( 0xCCD044A12FDB3E13 ); sc->h1 = _mm512_set1_epi64( 0xE83590301A79A9EB ); @@ -527,7 +527,7 @@ void skein256_8way_init( skein256_8way_context *sc ) sc->ptr = 0; } -void skein512_8way_init( skein512_8way_context *sc ) +void skein512_8x64_init( skein512_8x64_context *sc ) { sc->h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE ); sc->h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 ); @@ -542,7 +542,7 @@ void skein512_8way_init( skein512_8way_context *sc ) } static void -skein_big_core_8way( skein512_8way_context *sc, const void *data, +skein_big_core_8x64( skein512_8x64_context *sc, const void *data, size_t len ) { __m512i *vdata = (__m512i*)data; @@ -587,7 +587,7 @@ skein_big_core_8way( skein512_8way_context *sc, const void *data, } static void -skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n, +skein_big_close_8x64( skein512_8x64_context *sc, unsigned ub, unsigned n, void *dst, size_t out_len ) { __m512i *buf; @@ -621,7 +621,7 @@ skein_big_close_8way( skein512_8way_context *sc, unsigned ub, unsigned n, memcpy_512( dst, buf, out_len >> 3 ); } -void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data, +void skein512_8x64_full( skein512_8x64_context *sc, void *out, const void *data, size_t len ) { __m512i h0, h1, h2, h3, h4, h5, h6, h7; @@ -698,7 +698,7 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data, } void -skein512_8way_prehash64( skein512_8way_context *sc, const void *data ) +skein512_8x64_prehash64( skein512_8x64_context *sc, const void *data ) { __m512i *vdata = (__m512i*)data; __m512i *buf = sc->buf; @@ -732,7 +732,7 @@ skein512_8way_prehash64( skein512_8way_context *sc, const void *data ) } void -skein512_8way_final16( skein512_8way_context 
*sc, void *output, +skein512_8x64_final16( skein512_8x64_context *sc, void *output, const void *data ) { __m512i *in = (__m512i*)data; @@ -778,34 +778,34 @@ skein512_8way_final16( skein512_8way_context *sc, void *output, void -skein256_8way_update(void *cc, const void *data, size_t len) +skein256_8x64_update(void *cc, const void *data, size_t len) { - skein_big_core_8way(cc, data, len); + skein_big_core_8x64(cc, data, len); } void -skein256_8way_close(void *cc, void *dst) +skein256_8x64_close(void *cc, void *dst) { - skein_big_close_8way(cc, 0, 0, dst, 32); + skein_big_close_8x64(cc, 0, 0, dst, 32); } void -skein512_8way_update(void *cc, const void *data, size_t len) +skein512_8x64_update(void *cc, const void *data, size_t len) { - skein_big_core_8way(cc, data, len); + skein_big_core_8x64(cc, data, len); } void -skein512_8way_close(void *cc, void *dst) +skein512_8x64_close(void *cc, void *dst) { - skein_big_close_8way(cc, 0, 0, dst, 64); + skein_big_close_8x64(cc, 0, 0, dst, 64); } #endif // AVX512 #if defined(__AVX2__) -void skein256_4way_init( skein256_4way_context *sc ) +void skein256_4x64_init( skein256_4x64_context *sc ) { sc->h0 = _mm256_set1_epi64x( 0xCCD044A12FDB3E13 ); sc->h1 = _mm256_set1_epi64x( 0xE83590301A79A9EB ); @@ -819,7 +819,7 @@ void skein256_4way_init( skein256_4way_context *sc ) sc->ptr = 0; } -void skein512_4way_init( skein512_4way_context *sc ) +void skein512_4x64_init( skein512_4x64_context *sc ) { sc->h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE ); sc->h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 ); @@ -835,7 +835,7 @@ void skein512_4way_init( skein512_4way_context *sc ) // Do not use for 128 bt data length static void -skein_big_core_4way( skein512_4way_context *sc, const void *data, +skein_big_core_4x64( skein512_4x64_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; @@ -882,7 +882,7 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data, } static void -skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n, +skein_big_close_4x64( skein512_4x64_context *sc, unsigned ub, unsigned n, void *dst, size_t out_len ) { __m256i *buf; @@ -920,7 +920,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n, } void -skein512_4way_full( skein512_4way_context *sc, void *out, const void *data, +skein512_4x64_full( skein512_4x64_context *sc, void *out, const void *data, size_t len ) { __m256i h0, h1, h2, h3, h4, h5, h6, h7; @@ -995,7 +995,7 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data, } void -skein512_4way_prehash64( skein512_4way_context *sc, const void *data ) +skein512_4x64_prehash64( skein512_4x64_context *sc, const void *data ) { __m256i *vdata = (__m256i*)data; __m256i *buf = sc->buf; @@ -1029,7 +1029,7 @@ skein512_4way_prehash64( skein512_4way_context *sc, const void *data ) } void -skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data ) +skein512_4x64_final16( skein512_4x64_context *sc, void *out, const void *data ) { __m256i *vdata = (__m256i*)data; __m256i *buf = sc->buf; @@ -1073,29 +1073,29 @@ skein512_4way_final16( skein512_4way_context *sc, void *out, const void *data ) // Broken for 80 bytes, use prehash. 
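/* Annotation, not part of the patch: because update()/close() cannot take
 * the 80-byte block header directly, the scanhash code (see skein-4way.c
 * earlier in this commit) splits the work into a one-time prehash of the
 * first 64 bytes and a per-nonce finish of the last 16:
 *
 *    skein512_4x64_prehash64( &skein512_4x64_ctx, vdata );          // once, before the nonce loop
 *    skein512_4x64_final16( &ctx_skein, vhash64, input + (64*4) );  // per nonce
 *
 * The 8x64 path uses the matching prehash64/final16 pair. */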
void -skein256_4way_update(void *cc, const void *data, size_t len) +skein256_4x64_update(void *cc, const void *data, size_t len) { - skein_big_core_4way(cc, data, len); + skein_big_core_4x64(cc, data, len); } void -skein256_4way_close(void *cc, void *dst) +skein256_4x64_close(void *cc, void *dst) { - skein_big_close_4way(cc, 0, 0, dst, 32); + skein_big_close_4x64(cc, 0, 0, dst, 32); } // Broken for 80 & 128 bytes, use prehash or full void -skein512_4way_update(void *cc, const void *data, size_t len) +skein512_4x64_update(void *cc, const void *data, size_t len) { - skein_big_core_4way(cc, data, len); + skein_big_core_4x64(cc, data, len); } void -skein512_4way_close(void *cc, void *dst) +skein512_4x64_close(void *cc, void *dst) { - skein_big_close_4way(cc, 0, 0, dst, 64); + skein_big_close_4x64(cc, 0, 0, dst, 64); } #endif // AVX2 @@ -1231,7 +1231,7 @@ void skein512_2x64_init( skein512_2x64_context *sc ) } static void -skein_big_core_2way( skein512_2x64_context *sc, const void *data, +skein_big_core_2x64( skein512_2x64_context *sc, const void *data, size_t len ) { v128u64_t *vdata = (v128u64_t*)data; @@ -1278,7 +1278,7 @@ skein_big_core_2way( skein512_2x64_context *sc, const void *data, } static void -skein_big_close_2way( skein512_2x64_context *sc, unsigned ub, unsigned n, +skein_big_close_2x64( skein512_2x64_context *sc, unsigned ub, unsigned n, void *dst, size_t out_len ) { v128u64_t *buf; @@ -1471,13 +1471,13 @@ skein512_2x64_final16( skein512_2x64_context *sc, void *out, const void *data ) void skein256_2x64_update(void *cc, const void *data, size_t len) { - skein_big_core_2way(cc, data, len); + skein_big_core_2x64(cc, data, len); } void skein256_2x64_close(void *cc, void *dst) { - skein_big_close_2way(cc, 0, 0, dst, 32); + skein_big_close_2x64(cc, 0, 0, dst, 32); } @@ -1485,13 +1485,12 @@ skein256_2x64_close(void *cc, void *dst) void skein512_2x64_update(void *cc, const void *data, size_t len) { - skein_big_core_2way(cc, data, len); + skein_big_core_2x64(cc, data, len); } void skein512_2x64_close(void *cc, void *dst) { - skein_big_close_2way(cc, 0, 0, dst, 64); + skein_big_close_2x64(cc, 0, 0, dst, 64); } - diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index f1f2d42..c09d3ef 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -52,24 +52,36 @@ typedef struct __m512i h0, h1, h2, h3, h4, h5, h6, h7; size_t ptr; uint64_t bcount; -} skein_8way_big_context __attribute__ ((aligned (128))); +} skein_8x64_big_context __attribute__ ((aligned (128))); -typedef skein_8way_big_context skein512_8way_context; -typedef skein_8way_big_context skein256_8way_context; +typedef skein_8x64_big_context skein512_8x64_context; +typedef skein_8x64_big_context skein256_8x64_context; -void skein512_8way_full( skein512_8way_context *sc, void *out, +void skein512_8x64_full( skein512_8x64_context *sc, void *out, const void *data, size_t len ); -void skein512_8way_init( skein512_8way_context *sc ); -void skein512_8way_update( void *cc, const void *data, size_t len ); -void skein512_8way_close( void *cc, void *dst ); +void skein512_8x64_init( skein512_8x64_context *sc ); +void skein512_8x64_update( void *cc, const void *data, size_t len ); +void skein512_8x64_close( void *cc, void *dst ); -void skein512_8way_prehash64( skein512_8way_context *sc, const void *data ); -void skein512_8way_final16( skein512_8way_context *sc, void *out, +void skein512_8x64_prehash64( skein512_8x64_context *sc, const void *data ); +void skein512_8x64_final16( skein512_8x64_context *sc, 
void *out, const void *data ); -void skein256_8way_init( skein256_8way_context *sc ); -void skein256_8way_update( void *cc, const void *data, size_t len ); -void skein256_8way_close( void *cc, void *dst ); +void skein256_8x64_init( skein256_8x64_context *sc ); +void skein256_8x64_update( void *cc, const void *data, size_t len ); +void skein256_8x64_close( void *cc, void *dst ); + +#define skein512_8way_context skein512_8x64_context +#define skein512_8way_full skein512_8x64_full +#define skein512_8way_init skein512_8x64_init +#define skein512_8way_update skein512_8x64_update +#define skein512_8way_close skein512_8x64_close +#define skein512_8way_prehash64 skein512_8x64_prehash64 +#define skein512_8way_final16 skein512_8x64_final16 +#define skein256_8way_context skein256_8x64_context +#define skein256_8way_init skein256_8x64_init +#define skein256_8way_update skein256_8x64_update +#define skein256_8way_close skein256_8x64_close #endif // AVX512 @@ -81,25 +93,35 @@ typedef struct __m256i h0, h1, h2, h3, h4, h5, h6, h7; size_t ptr; uint64_t bcount; -} skein_4way_big_context __attribute__ ((aligned (128))); +} skein_4x64_big_context __attribute__ ((aligned (128))); -typedef skein_4way_big_context skein512_4way_context; -typedef skein_4way_big_context skein256_4way_context; +typedef skein_4x64_big_context skein512_4x64_context; +typedef skein_4x64_big_context skein256_4x64_context; -void skein512_4way_init( skein512_4way_context *sc ); -void skein512_4way_full( skein512_4way_context *sc, void *out, +void skein512_4x64_init( skein512_4x64_context *sc ); +void skein512_4x64_full( skein512_4x64_context *sc, void *out, const void *data, size_t len ); -void skein512_4way_update( void *cc, const void *data, size_t len ); -void skein512_4way_close( void *cc, void *dst ); - -void skein256_4way_init( skein256_4way_context *sc ); -void skein256_4way_update( void *cc, const void *data, size_t len ); -void skein256_4way_close( void *cc, void *dst ); - -void skein512_4way_prehash64( skein512_4way_context *sc, const void *data ); -void skein512_4way_final16( skein512_4way_context *sc, void *out, +void skein512_4x64_update( void *cc, const void *data, size_t len ); +void skein512_4x64_close( void *cc, void *dst ); +void skein512_4x64_prehash64( skein512_4x64_context *sc, const void *data ); +void skein512_4x64_final16( skein512_4x64_context *sc, void *out, const void *data ); +void skein256_4x64_init( skein256_4x64_context *sc ); +void skein256_4x64_update( void *cc, const void *data, size_t len ); +void skein256_4x64_close( void *cc, void *dst ); + +#define skein512_4way_context skein512_4x64_context +#define skein512_4way_full skein512_4x64_full +#define skein512_4way_init skein512_4x64_init +#define skein512_4way_update skein512_4x64_update +#define skein512_4way_close skein512_4x64_close +#define skein512_4way_prehash64 skein512_4x64_prehash64 +#define skein512_4way_final16 skein512_4x64_final16 +#define skein256_4way_context skein256_4x64_context +#define skein256_4way_init skein256_4x64_init +#define skein256_4way_update skein256_4x64_update +#define skein256_4way_close skein256_4x64_close #endif @@ -109,10 +131,10 @@ typedef struct v128u64_t h0, h1, h2, h3, h4, h5, h6, h7; size_t ptr; uint64_t bcount; -} skein_2way_big_context __attribute__ ((aligned (128))); +} skein_2x64_big_context __attribute__ ((aligned (128))); -typedef skein_2way_big_context skein512_2x64_context; -typedef skein_2way_big_context skein256_2x64_context; +typedef skein_2x64_big_context skein512_2x64_context; +typedef 
skein_2x64_big_context skein256_2x64_context; void skein512_2x64_init( skein512_2x64_context *sc ); void skein512_2x64_full( skein512_2x64_context *sc, void *out, diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index 697d64b..188257d 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -21,17 +21,17 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce, __m512i *noncev = (__m512i*)vdata + 9; const int thr_id = mythr->id; const bool bench = opt_benchmark; - skein512_8way_context ctx; + skein512_8x64_context ctx; mm512_bswap32_intrlv80_8x64( vdata, pdata ); *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev ); - skein512_8way_prehash64( &ctx, vdata ); + skein512_8x64_prehash64( &ctx, vdata ); do { - skein512_8way_final16( &ctx, hash, vdata + (16*8) ); - skein512_8way_full( &ctx, hash, hash, 64 ); + skein512_8x64_final16( &ctx, hash, vdata + (16*8) ); + skein512_8x64_full( &ctx, hash, hash, 64 ); for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hashq3[ lane ] <= targq3 && !bench ) ) @@ -71,16 +71,16 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce, __m256i *noncev = (__m256i*)vdata + 9; const int thr_id = mythr->id; const bool bench = opt_benchmark; - skein512_4way_context ctx; + skein512_4x64_context ctx; mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_prehash64( &ctx, vdata ); + skein512_4x64_prehash64( &ctx, vdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - skein512_4way_final16( &ctx, hash, vdata + (16*4) ); - skein512_4way_full( &ctx, hash, hash, 64 ); + skein512_4x64_final16( &ctx, hash, vdata + (16*4) ); + skein512_4x64_full( &ctx, hash, hash, 64 ); for ( int lane = 0; lane < 4; lane++ ) if ( hash_q3[ lane ] <= targ_q3 ) diff --git a/algo/x16/hex.c b/algo/x16/hex.c index 6be0a05..d2ef99f 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -189,7 +189,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce, v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t ntime = swab32(pdata[17]); + uint32_t ntime = bswap_32(pdata[17]); if ( s_ntime != ntime ) { hex_getAlgoString( (const uint32_t*) (&edata[1]), x16r_hash_order ); diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 0e62e38..1b88d72 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -31,18 +31,18 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order ) { case JH: mm512_bswap32_intrlv80_8x64( vdata, pdata ); - jh512_8way_init( &x16r_ctx.jh ); - jh512_8way_update( &x16r_ctx.jh, vdata, 64 ); + jh512_8x64_init( &x16r_ctx.jh ); + jh512_8x64_update( &x16r_ctx.jh, vdata, 64 ); break; case KECCAK: mm512_bswap32_intrlv80_8x64( vdata, pdata ); - keccak512_8way_init( &x16r_ctx.keccak ); - keccak512_8way_update( &x16r_ctx.keccak, vdata, 72 ); + keccak512_8x64_init( &x16r_ctx.keccak ); + keccak512_8x64_update( &x16r_ctx.keccak, vdata, 72 ); break; case SKEIN: mm512_bswap32_intrlv80_8x64( vdata, pdata ); - skein512_8way_init( &x16r_ctx.skein ); - skein512_8way_update( &x16r_ctx.skein, vdata, 64 ); + skein512_8x64_init( &x16r_ctx.skein ); + skein512_8x64_update( &x16r_ctx.skein, vdata, 64 ); break; case LUFFA: { @@ -78,8 +78,8 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order ) break; case HAMSI: mm512_bswap32_intrlv80_8x64( vdata, pdata ); - hamsi512_8way_init( &x16r_ctx.hamsi ); - hamsi512_8way_update( &x16r_ctx.hamsi, 
vdata, 72 ); + hamsi512_8x64_init( &x16r_ctx.hamsi ); + hamsi512_8x64_update( &x16r_ctx.hamsi, vdata, 72 ); break; case FUGUE: v128_bswap32_80( edata, pdata ); @@ -90,8 +90,8 @@ void x16r_8way_prehash( void *vdata, void *pdata, const char *hash_order ) break; case SHABAL: mm256_bswap32_intrlv80_8x32( vdata2, pdata ); - shabal512_8way_init( &x16r_ctx.shabal ); - shabal512_8way_update( &x16r_ctx.shabal, vdata2, 64 ); + shabal512_8x32_init( &x16r_ctx.shabal ); + shabal512_8x32_update( &x16r_ctx.shabal, vdata2, 64 ); rintrlv_8x32_8x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: @@ -146,27 +146,27 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid, { case BLAKE: if ( i == 0 ) - blake512_8way_full( &ctx.blake, vhash, input, size ); + blake512_8x64_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_full( &ctx.blake, vhash, vhash, size ); + blake512_8x64_full( &ctx.blake, vhash, vhash, size ); } dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case BMW: - bmw512_8way_init( &ctx.bmw ); + bmw512_8x64_init( &ctx.bmw ); if ( i == 0 ) - bmw512_8way_update( &ctx.bmw, input, size ); + bmw512_8x64_update( &ctx.bmw, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - bmw512_8way_update( &ctx.bmw, vhash, size ); + bmw512_8x64_update( &ctx.bmw, vhash, size ); } - bmw512_8way_close( &ctx.bmw, vhash ); + bmw512_8x64_close( &ctx.bmw, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -191,43 +191,43 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid, break; case JH: if ( i == 0 ) - jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); + jh512_8x64_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, size ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, size ); } - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_close( &ctx.jh, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case KECCAK: if ( i == 0 ) - keccak512_8way_update( &ctx.keccak, input + (72<<3), 8 ); + keccak512_8x64_update( &ctx.keccak, input + (72<<3), 8 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, size ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, size ); } - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_close( &ctx.keccak, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case SKEIN: if ( i == 0 ) - skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + skein512_8x64_update( &ctx.skein, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, size ); + skein512_8x64_init( &ctx.skein ); + skein512_8x64_update( &ctx.skein, vhash, size ); } - skein512_8way_close( &ctx.skein, vhash ); + skein512_8x64_close( &ctx.skein, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -333,15 +333,15 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid, break; case HAMSI: if ( i == 0 ) - 
hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 ); + hamsi512_8x64_update( &ctx.hamsi, input + (72<<3), 8 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, size ); } - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -388,13 +388,13 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid, intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); if ( i == 0 ) - shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + shabal512_8x32_update( &ctx.shabal, vhash + (16<<3), 16 ); else { - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, size ); } - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -438,16 +438,16 @@ int x16r_8way_hash_generic( void* output, const void* input, int thrid, } break; case SHA_512: - sha512_8way_init( &ctx.sha512 ); + sha512_8x64_init( &ctx.sha512 ); if ( i == 0 ) - sha512_8way_update( &ctx.sha512, input, size ); + sha512_8x64_update( &ctx.sha512, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - sha512_8way_update( &ctx.sha512, vhash, size ); + sha512_8x64_update( &ctx.sha512, vhash, size ); } - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -556,17 +556,17 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order ) { case JH: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - jh512_4way_init( &x16r_ctx.jh ); - jh512_4way_update( &x16r_ctx.jh, vdata, 64 ); + jh512_4x64_init( &x16r_ctx.jh ); + jh512_4x64_update( &x16r_ctx.jh, vdata, 64 ); break; case KECCAK: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - keccak512_4way_init( &x16r_ctx.keccak ); - keccak512_4way_update( &x16r_ctx.keccak, vdata, 72 ); + keccak512_4x64_init( &x16r_ctx.keccak ); + keccak512_4x64_update( &x16r_ctx.keccak, vdata, 72 ); break; case SKEIN: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_prehash64( &x16r_ctx.skein, vdata ); + skein512_4x64_prehash64( &x16r_ctx.skein, vdata ); break; case LUFFA: { @@ -599,8 +599,8 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order ) break; case HAMSI: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - hamsi512_4way_init( &x16r_ctx.hamsi ); - hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 ); + hamsi512_4x64_init( &x16r_ctx.hamsi ); + hamsi512_4x64_update( &x16r_ctx.hamsi, vdata, 72 ); break; case FUGUE: v128_bswap32_80( edata, pdata ); @@ -610,8 +610,8 @@ void x16r_4way_prehash( void *vdata, void *pdata, const char *hash_order ) break; case SHABAL: v128_bswap32_intrlv80_4x32( vdata2, pdata ); - shabal512_4way_init( &x16r_ctx.shabal ); - shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 ); + shabal512_4x32_init( &x16r_ctx.shabal ); + shabal512_4x32_update( &x16r_ctx.shabal, vdata2, 64 ); rintrlv_4x32_4x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: @@ -652,24 +652,24 @@ int x16r_4way_hash_generic( void* output, const void* 
input, int thrid, { case BLAKE: if ( i == 0 ) - blake512_4way_full( &ctx.blake, vhash, input, size ); + blake512_4x64_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_full( &ctx.blake, vhash, vhash, size ); + blake512_4x64_full( &ctx.blake, vhash, vhash, size ); } dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case BMW: - bmw512_4way_init( &ctx.bmw ); + bmw512_4x64_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way_update( &ctx.bmw, input, size ); + bmw512_4x64_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way_update( &ctx.bmw, vhash, size ); + bmw512_4x64_update( &ctx.bmw, vhash, size ); } - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: @@ -689,35 +689,35 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid, break; case JH: if ( i == 0 ) - jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); + jh512_4x64_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, size ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, size ); } - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_close( &ctx.jh, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: if ( i == 0 ) - keccak512_4way_update( &ctx.keccak, input + (72<<2), 8 ); + keccak512_4x64_update( &ctx.keccak, input + (72<<2), 8 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, size ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, size ); } - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_close( &ctx.keccak, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case SKEIN: if ( i == 0 ) - skein512_4way_final16( &ctx.skein, vhash, input + (64*4) ); + skein512_4x64_final16( &ctx.skein, vhash, input + (64*4) ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_full( &ctx.skein, vhash, vhash, size ); + skein512_4x64_full( &ctx.skein, vhash, vhash, size ); } dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; @@ -809,14 +809,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid, break; case HAMSI: if ( i == 0 ) - hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 ); + hamsi512_4x64_update( &ctx.hamsi, input + (72<<2), 8 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, size ); } - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: @@ -845,13 +845,13 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid, case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); if ( i == 0 ) - shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + shabal512_4x32_update( &ctx.shabal, vhash + (16<<2), 16 ); else { - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, size ); } - shabal512_4way_close( &ctx.shabal, 
vhash ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); break; case WHIRLPOOL: @@ -878,16 +878,16 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid, } break; case SHA_512: - sha512_4way_init( &ctx.sha512 ); + sha512_4x64_init( &ctx.sha512 ); if ( i == 0 ) - sha512_4way_update( &ctx.sha512, input, size ); + sha512_4x64_update( &ctx.sha512, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, size ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, size ); } - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; } diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 48d5a6d..5955b41 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -125,19 +125,19 @@ bool register_x21s__algo( algo_gate_t* gate ); union _x16r_8way_context_overlay { - blake512_8way_context blake; - bmw512_8way_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + blake512_8x64_context blake; + bmw512_8x64_context bmw; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cube_4way_context cube; simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_8way_context shabal; + shabal512_8x32_context shabal; sph_whirlpool_context whirlpool; - sha512_8way_context sha512; + sha512_8x64_context sha512; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -170,8 +170,8 @@ int scanhash_x16r_8way( struct work *, uint32_t , union _x16r_4way_context_overlay { - blake512_4way_context blake; - bmw512_4way_context bmw; + blake512_4x64_context blake; + bmw512_4x64_context bmw; #if defined(__VAES__) groestl512_2way_context groestl; shavite512_2way_context shavite; @@ -181,17 +181,17 @@ union _x16r_4way_context_overlay shavite512_context shavite; hashState_echo echo; #endif - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; + skein512_4x64_context skein; + jh512_4x64_context jh; + keccak512_4x64_context keccak; luffa_2way_context luffa; cube_2way_context cube; simd_2way_context simd; - hamsi512_4way_context hamsi; + hamsi512_4x64_context hamsi; hashState_fugue fugue; - shabal512_4way_context shabal; + shabal512_4x32_context shabal; sph_whirlpool_context whirlpool; - sha512_4way_context sha512; + sha512_4x64_context sha512; } __attribute__ ((aligned (64))); #define _x16r_4x64_context_overlay _x16r_4way_context_overlay diff --git a/algo/x16/x16rt.c b/algo/x16/x16rt.c index 292c3c9..add5d86 100644 --- a/algo/x16/x16rt.c +++ b/algo/x16/x16rt.c @@ -20,7 +20,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce, v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; - uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80; + uint32_t masked_ntime = bswap_32( pdata[17] ) & 0xffffff80; if ( s_ntime != masked_ntime ) { x16rt_getTimeHash( masked_ntime, &timeHash ); @@ -28,7 +28,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce, s_ntime = masked_ntime; if ( !thr_id ) applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", - x16r_hash_order, swab32( pdata[17] ), timeHash ); + x16r_hash_order, bswap_32( pdata[17] ), timeHash ); } 
x16r_prehash( edata, pdata, x16r_hash_order ); diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index b79b4a3..e67172c 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -14,19 +14,19 @@ union _x16rv2_8way_context_overlay { - blake512_8way_context blake; - bmw512_8way_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + blake512_8x64_context blake; + bmw512_8x64_context bmw; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cubehashParam cube; simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_8way_context shabal; + shabal512_8x32_context shabal; sph_whirlpool_context whirlpool; - sha512_8way_context sha512; + sha512_8x64_context sha512; sph_tiger_context tiger; #if defined(__VAES__) groestl512_4way_context groestl; @@ -76,29 +76,29 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid ) switch ( algo ) { case BLAKE: - blake512_8way_init( &ctx.blake ); + blake512_8x64_init( &ctx.blake ); if ( i == 0 ) - blake512_8way_full( &ctx.blake, vhash, input, size ); + blake512_8x64_full( &ctx.blake, vhash, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - blake512_8way_full( &ctx.blake, vhash, vhash, size ); + blake512_8x64_full( &ctx.blake, vhash, vhash, size ); } dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case BMW: - bmw512_8way_init( &ctx.bmw ); + bmw512_8x64_init( &ctx.bmw ); if ( i == 0 ) - bmw512_8way_update( &ctx.bmw, input, size ); + bmw512_8x64_update( &ctx.bmw, input, size ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - bmw512_8way_update( &ctx.bmw, vhash, size ); + bmw512_8x64_update( &ctx.bmw, vhash, size ); } - bmw512_8way_close( &ctx.bmw, vhash ); + bmw512_8x64_close( &ctx.bmw, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -123,15 +123,15 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid ) break; case JH: if ( i == 0 ) - jh512_8way_update( &ctx.jh, input + (64<<3), 16 ); + jh512_8x64_update( &ctx.jh, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, size ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, size ); } - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_close( &ctx.jh, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -165,30 +165,30 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid ) } else { - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in0, size ); - sph_tiger_close( &ctx.tiger, hash0 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in1, size ); - sph_tiger_close( &ctx.tiger, hash1 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in2, size ); - sph_tiger_close( &ctx.tiger, hash2 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in3, size ); - sph_tiger_close( &ctx.tiger, hash3 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in4, size ); - sph_tiger_close( &ctx.tiger, hash4 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in5, size ); - sph_tiger_close( &ctx.tiger, hash5 ); - sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in6, size ); - sph_tiger_close( &ctx.tiger, hash6 ); - 
sph_tiger_init( &ctx.tiger ); - sph_tiger( &ctx.tiger, in7, size ); - sph_tiger_close( &ctx.tiger, hash7 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in0, size ); + sph_tiger_close( &ctx.tiger, hash0 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in1, size ); + sph_tiger_close( &ctx.tiger, hash1 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in2, size ); + sph_tiger_close( &ctx.tiger, hash2 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in3, size ); + sph_tiger_close( &ctx.tiger, hash3 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in4, size ); + sph_tiger_close( &ctx.tiger, hash4 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in5, size ); + sph_tiger_close( &ctx.tiger, hash5 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in6, size ); + sph_tiger_close( &ctx.tiger, hash6 ); + sph_tiger_init( &ctx.tiger ); + sph_tiger( &ctx.tiger, in7, size ); + sph_tiger_close( &ctx.tiger, hash7 ); } for ( int i = (24/4); i < (64/4); i++ ) @@ -197,23 +197,23 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; case SKEIN: if ( i == 0 ) - skein512_8way_update( &ctx.skein, input + (64<<3), 16 ); + skein512_8x64_update( &ctx.skein, input + (64<<3), 16 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, size ); + skein512_8x64_init( &ctx.skein ); + skein512_8x64_update( &ctx.skein, vhash, size ); } - skein512_8way_close( &ctx.skein, vhash ); + skein512_8x64_close( &ctx.skein, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -395,16 +395,16 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid ) break; case HAMSI: if ( i == 0 ) - hamsi512_8way_update( &ctx.hamsi, input + (72<<3), 8 ); + hamsi512_8x64_update( &ctx.hamsi, input + (72<<3), 8 ); else { intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, size ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, size ); } - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -451,13 +451,13 @@ int x16rv2_8way_hash( void* output, const void* input, int thrid ) intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); if ( i == 0 ) - shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 ); + shabal512_8x32_update( &ctx.shabal, vhash + (16<<3), 16 ); else { - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, size ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, size ); } - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -562,9 +562,9 @@ int x16rv2_8way_hash( void* output, const void* input, int 
thrid ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, 64 ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, 64 ); + sha512_8x64_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); break; @@ -623,8 +623,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, { case JH: mm512_bswap32_intrlv80_8x64( vdata, pdata ); - jh512_8way_init( &x16rv2_ctx.jh ); - jh512_8way_update( &x16rv2_ctx.jh, vdata, 64 ); + jh512_8x64_init( &x16rv2_ctx.jh ); + jh512_8x64_update( &x16rv2_ctx.jh, vdata, 64 ); break; case KECCAK: case LUFFA: @@ -637,8 +637,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, break; case SKEIN: mm512_bswap32_intrlv80_8x64( vdata, pdata ); - skein512_8way_init( &x16rv2_ctx.skein ); - skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 ); + skein512_8x64_init( &x16rv2_ctx.skein ); + skein512_8x64_update( &x16rv2_ctx.skein, vdata, 64 ); break; case CUBEHASH: v128_bswap32_80( edata, pdata ); @@ -649,8 +649,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, break; case HAMSI: mm512_bswap32_intrlv80_8x64( vdata, pdata ); - hamsi512_8way_init( &x16rv2_ctx.hamsi ); - hamsi512_8way_update( &x16rv2_ctx.hamsi, vdata, 72 ); + hamsi512_8x64_init( &x16rv2_ctx.hamsi ); + hamsi512_8x64_update( &x16rv2_ctx.hamsi, vdata, 72 ); break; case FUGUE: v128_bswap32_80( edata, pdata ); @@ -661,8 +661,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, break; case SHABAL: mm256_bswap32_intrlv80_8x32( vdata2, pdata ); - shabal512_8way_init( &x16rv2_ctx.shabal ); - shabal512_8way_update( &x16rv2_ctx.shabal, vdata2, 64 ); + shabal512_8x32_init( &x16rv2_ctx.shabal ); + shabal512_8x32_update( &x16rv2_ctx.shabal, vdata2, 64 ); rintrlv_8x32_8x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: @@ -701,8 +701,8 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, union _x16rv2_4way_context_overlay { - blake512_4way_context blake; - bmw512_4way_context bmw; + blake512_4x64_context blake; + bmw512_4x64_context bmw; #if defined(__VAES__) groestl512_2way_context groestl; shavite512_2way_context shavite; @@ -712,17 +712,17 @@ union _x16rv2_4way_context_overlay shavite512_context shavite; hashState_echo echo; #endif - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; + skein512_4x64_context skein; + jh512_4x64_context jh; + keccak512_4x64_context keccak; luffa_2way_context luffa; cubehashParam cube; simd_2way_context simd; - hamsi512_4way_context hamsi; + hamsi512_4x64_context hamsi; hashState_fugue fugue; - shabal512_4way_context shabal; + shabal512_4x32_context shabal; sph_whirlpool_context whirlpool; - sha512_4way_context sha512; + sha512_4x64_context sha512; sph_tiger_context tiger; }; typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay; @@ -761,24 +761,24 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) { case BLAKE: if ( i == 0 ) - blake512_4way_full( &ctx.blake, vhash, input, size ); + blake512_4x64_full( &ctx.blake, vhash, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way_full( &ctx.blake, vhash, vhash, size ); + blake512_4x64_full( &ctx.blake, vhash, vhash, size ); } dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case BMW: - bmw512_4way_init( &ctx.bmw ); + 
bmw512_4x64_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way_update( &ctx.bmw, input, size ); + bmw512_4x64_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way_update( &ctx.bmw, vhash, size ); + bmw512_4x64_update( &ctx.bmw, vhash, size ); } - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: @@ -798,14 +798,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) break; case JH: if ( i == 0 ) - jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); + jh512_4x64_update( &ctx.jh, input + (64<<2), 16 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, size ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, size ); } - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_close( &ctx.jh, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case KECCAK: @@ -842,20 +842,20 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case SKEIN: if ( i == 0 ) - skein512_4way_final16( &ctx.skein, vhash, input + (64*4) ); + skein512_4x64_final16( &ctx.skein, vhash, input + (64*4) ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way_init( &ctx.skein ); - skein512_4way_update( &ctx.skein, vhash, size ); - skein512_4way_close( &ctx.skein, vhash ); + skein512_4x64_init( &ctx.skein ); + skein512_4x64_update( &ctx.skein, vhash, size ); + skein512_4x64_close( &ctx.skein, vhash ); } dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; @@ -976,14 +976,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) break; case HAMSI: if ( i == 0 ) - hamsi512_4way_update( &ctx.hamsi, input + (72<<2), 8 ); + hamsi512_4x64_update( &ctx.hamsi, input + (72<<2), 8 ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, size ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, size ); } - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case FUGUE: @@ -1012,13 +1012,13 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); if ( i == 0 ) - shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 ); + shabal512_4x32_update( &ctx.shabal, vhash + (16<<2), 16 ); else { - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, size ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, size ); } - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); break; case WHIRLPOOL: @@ -1078,9 +1078,9 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0; intrlv_4x64( vhash, hash0, hash1, hash2, 
hash3, 512 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, 64 ); + sha512_4x64_close( &ctx.sha512, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; } @@ -1133,8 +1133,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, { case JH: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - jh512_4way_init( &x16rv2_ctx.jh ); - jh512_4way_update( &x16rv2_ctx.jh, vdata, 64 ); + jh512_4x64_init( &x16rv2_ctx.jh ); + jh512_4x64_update( &x16rv2_ctx.jh, vdata, 64 ); break; case KECCAK: case LUFFA: @@ -1146,7 +1146,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, break; case SKEIN: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - skein512_4way_prehash64( &x16rv2_ctx.skein, vdata ); + skein512_4x64_prehash64( &x16rv2_ctx.skein, vdata ); break; case CUBEHASH: v128_bswap32_80( edata, pdata ); @@ -1156,8 +1156,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, break; case HAMSI: mm256_bswap32_intrlv80_4x64( vdata, pdata ); - hamsi512_4way_init( &x16rv2_ctx.hamsi ); - hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 72 ); + hamsi512_4x64_init( &x16rv2_ctx.hamsi ); + hamsi512_4x64_update( &x16rv2_ctx.hamsi, vdata, 72 ); break; case FUGUE: v128_bswap32_80( edata, pdata ); @@ -1167,8 +1167,8 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, break; case SHABAL: v128_bswap32_intrlv80_4x32( vdata32, pdata ); - shabal512_4way_init( &x16rv2_ctx.shabal ); - shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 ); + shabal512_4x32_init( &x16rv2_ctx.shabal ); + shabal512_4x32_update( &x16rv2_ctx.shabal, vdata32, 64 ); rintrlv_4x32_4x64( vdata, vdata32, 640 ); break; case WHIRLPOOL: diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index 4ad201b..9d2fe34 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -168,7 +168,7 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce, static __thread uint32_t s_ntime = UINT32_MAX; if ( s_ntime != pdata[17] ) { - uint32_t ntime = swab32(pdata[17]); + uint32_t ntime = bswap_32(pdata[17]); x16_r_s_getAlgoString( (const uint8_t*) (&edata[1]), x16r_hash_order ); s_ntime = ntime; if ( opt_debug && !thr_id ) diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 8769480..a0463e9 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -21,10 +21,10 @@ static __thread uint64_t* x21s_8way_matrix; union _x21s_8way_context_overlay { - haval256_5_8way_context haval; + haval256_8x32_context haval; sph_tiger_context tiger; sph_gost512_context gost; - sha256_8way_context sha256; + sha256_8x32_context sha256; } __attribute__ ((aligned (64))); typedef union _x21s_8way_context_overlay x21s_8way_context_overlay; @@ -50,9 +50,9 @@ int x21s_8way_hash( void* output, const void* input, int thrid ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhash, 64 ); - haval256_5_8way_close( &ctx.haval, vhash ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhash, 64 ); + haval256_8x32_close( &ctx.haval, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -122,9 +122,9 @@ int x21s_8way_hash( void* output, const void* input, int thrid ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - sha256_8way_init( &ctx.sha256 ); - 
sha256_8way_update( &ctx.sha256, vhash, 64 ); - sha256_8way_close( &ctx.sha256, output ); + sha256_8x32_init( &ctx.sha256 ); + sha256_8x32_update( &ctx.sha256, vhash, 64 ); + sha256_8x32_close( &ctx.sha256, output ); return 1; } @@ -202,11 +202,11 @@ static __thread uint64_t* x21s_4way_matrix; union _x21s_4way_context_overlay { - haval256_5_4way_context haval; + haval256_4x32_context haval; sph_tiger_context tiger; sph_gost512_context gost; #if !defined(__SHA__) - sha256_4way_context sha256; + sha256_4x32_context sha256; #endif } __attribute__ ((aligned (64))); @@ -228,9 +228,9 @@ int x21s_4way_hash( void* output, const void* input, int thrid ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhash, 64 ); - haval256_5_4way_close( &ctx.haval, vhash ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhash, 64 ); + haval256_4x32_close( &ctx.haval, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -279,9 +279,9 @@ int x21s_4way_hash( void* output, const void* input, int thrid ) #else intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - sha256_4way_init( &ctx.sha256 ); - sha256_4way_update( &ctx.sha256, vhash, 64 ); - sha256_4way_close( &ctx.sha256, vhash ); + sha256_4x32_init( &ctx.sha256 ); + sha256_4x32_update( &ctx.sha256, vhash, 64 ); + sha256_4x32_close( &ctx.sha256, vhash ); dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 ); #endif diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index e9957fd..53e09d6 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -78,7 +78,7 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce, static __thread uint32_t s_ntime = UINT32_MAX; if ( s_ntime != pdata[17] ) { - uint32_t ntime = swab32(pdata[17]); + uint32_t ntime = bswap_32(pdata[17]); x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order ); s_ntime = ntime; if ( opt_debug && !thr_id ) diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index 61d2b60..97c732c 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -31,20 +31,20 @@ union _sonoa_8way_context_overlay { - blake512_8way_context blake; - bmw512_8way_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + blake512_8x64_context blake; + bmw512_8x64_context bmw; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cube_4way_context cube; simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_8way_context shabal; + shabal512_8x32_context shabal; sph_whirlpool_context whirlpool; - sha512_8way_context sha512; - haval256_5_8way_context haval; + sha512_8x64_context sha512; + haval256_8x32_context haval; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -75,9 +75,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) // 1 - blake512_8way_full( &ctx.blake, vhash, input, 80 ); + blake512_8x64_full( &ctx.blake, vhash, input, 80 ); - bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 ); #if defined(__VAES__) @@ -107,15 +107,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( 
&ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -189,7 +189,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) if ( work_restart[thr_id].restart ) return 0; // 2 - bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 ); #if defined(__VAES__) @@ -219,15 +219,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -298,14 +298,14 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); if ( work_restart[thr_id].restart ) return 0; // 3 - bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 ); #if defined(__VAES__) @@ -335,17 +335,17 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8x64_init( &ctx.skein ); + skein512_8x64_update( &ctx.skein, vhash, 64 ); + skein512_8x64_close( &ctx.skein, vhash ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -416,9 +416,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -438,7 +438,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + bmw512_8x64_full( 
&ctx.bmw, vhash, vhash, 64 ); #if defined(__VAES__) @@ -468,15 +468,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -547,9 +547,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -566,15 +566,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); rintrlv_8x32_8x64( vhashA, vhash, 512 ); - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhashA, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhashA, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); #if defined(__VAES__) @@ -633,13 +633,13 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) if ( work_restart[thr_id].restart ) return 0; // 5 - bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 ); rintrlv_8x64_8x32( vhashA, vhash, 512 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhashA, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhashA, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); #if defined(__VAES__) @@ -669,15 +669,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -748,9 +748,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + 
hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -767,9 +767,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -789,7 +789,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 ); #if defined(__VAES__) @@ -819,15 +819,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -898,9 +898,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -917,9 +917,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -936,9 +936,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, 64 ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, 64 ); + sha512_8x64_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -958,7 +958,7 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 ); #if defined(__VAES__) @@ -988,15 +988,15 @@ int 
sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -1067,9 +1067,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -1086,9 +1086,9 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -1105,15 +1105,15 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, 64 ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, 64 ); + sha512_8x64_close( &ctx.sha512, vhash ); rintrlv_8x64_8x32( vhashA, vhash, 512 ); - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhashA, 64 ); - haval256_5_8way_close( &ctx.haval, state ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhashA, 64 ); + haval256_8x32_close( &ctx.haval, state ); return 1; } @@ -1122,8 +1122,8 @@ int sonoa_8way_hash( void *state, const void *input, int thr_id ) union _sonoa_4way_context_overlay { - blake512_4way_context blake; - bmw512_4way_context bmw; + blake512_4x64_context blake; + bmw512_4x64_context bmw; #if defined(__VAES__) groestl512_2way_context groestl; echo512_2way_context echo; @@ -1131,19 +1131,19 @@ union _sonoa_4way_context_overlay hashState_groestl groestl; hashState_echo echo; #endif - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; + skein512_4x64_context skein; + jh512_4x64_context jh; + keccak512_4x64_context keccak; luffa_2way_context luffa; cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hamsi512_4way_context hamsi; + hamsi512_4x64_context hamsi; hashState_fugue fugue; - shabal512_4way_context shabal; + shabal512_4x32_context shabal; sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; + sha512_4x64_context sha512; + haval256_4x32_context haval; }; typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay; @@ -1161,11 +1161,11 @@ int 
sonoa_4way_hash( void *state, const void *input, int thr_id ) // 1 - blake512_4way_full( &ctx.blake, vhash, input, 80 ); + blake512_4x64_full( &ctx.blake, vhash, input, 80 ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -1189,15 +1189,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1241,9 +1241,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) if ( work_restart[thr_id].restart ) return 0; // 2 - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -1267,15 +1267,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1316,16 +1316,16 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); if ( work_restart[thr_id].restart ) return 0; // 3 - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -1349,15 +1349,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + 
keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1398,9 +1398,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1413,9 +1413,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) // 4 intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -1439,15 +1439,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1488,9 +1488,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1501,15 +1501,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, 64 ); + shabal512_4x32_close( &ctx.shabal, vhash ); rintrlv_4x32_4x64( vhashB, vhash, 512 ); - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhashB, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhashB, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); #if defined(__VAES__) @@ -1545,15 +1545,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) // 5 rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); rintrlv_4x64_4x32( vhashB, vhash, 512 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhashB, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhashB, 64 ); + 
shabal512_4x32_close( &ctx.shabal, vhash ); #if defined(__VAES__) @@ -1580,15 +1580,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1629,9 +1629,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1642,9 +1642,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, 64 ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1658,9 +1658,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -1684,15 +1684,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1733,9 +1733,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1746,9 +1746,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal 
); + shabal512_4x32_update( &ctx.shabal, vhash, 64 ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1759,9 +1759,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, 64 ); + sha512_4x64_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1775,9 +1775,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -1801,15 +1801,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1850,9 +1850,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1863,9 +1863,9 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, 64 ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1876,15 +1876,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, 64 ); + sha512_4x64_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashB, vhash, 512 ); - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhashB, 64 ); - haval256_5_4way_close( &ctx.haval, state ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhashB, 64 ); + haval256_4x32_close( &ctx.haval, state ); return 1; } diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 8b2b6f8..45be0a5 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -31,11 +31,11 @@ union _x17_16way_context_overlay { - blake512_8way_context 
blake; + blake512_8x64_context blake; bmw512_8x64_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cube_4way_2buf_context cube; #if defined(__VAES__) @@ -48,17 +48,17 @@ union _x17_16way_context_overlay hashState_echo echo; #endif simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_16way_context shabal; + shabal512_16x32_context shabal; sph_whirlpool_context whirlpool; - sha512_8way_context sha512; - haval256_5_16way_context haval; + sha512_8x64_context sha512; + haval256_16x32_context haval; } __attribute__ ((aligned (64))); typedef union _x17_16way_context_overlay x17_16way_context_overlay; static __thread __m512i x17_16way_midstate[16] __attribute__((aligned(64))); -static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64))); +static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64))); int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB, int thr_id ) @@ -85,13 +85,10 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB, uint64_t hash15[8] __attribute__ ((aligned (32))); x17_16way_context_overlay ctx; - - - - memcpy( &ctx.blake, &blake512_8way_ctx, sizeof (blake512_8way_ctx) ); - blake512_8way_final_le( &blake512_8way_ctx, vhashA, nonceA, + memcpy( &ctx.blake, &blake512_8x64_ctx, sizeof (blake512_8x64_ctx) ); + blake512_8x64_final_le( &blake512_8x64_ctx, vhashA, nonceA, x17_16way_midstate ); - blake512_8way_final_le( &ctx.blake, vhashB, nonceB, + blake512_8x64_final_le( &ctx.blake, vhashB, nonceB, x17_16way_midstate ); bmw512_8x64_full( &ctx.bmw, vhashA, vhashA, 64 ); @@ -140,22 +137,22 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB, #endif - skein512_8way_full( &ctx.skein, vhashA, vhashA, 64 ); - skein512_8way_full( &ctx.skein, vhashB, vhashB, 64 ); + skein512_8x64_full( &ctx.skein, vhashA, vhashA, 64 ); + skein512_8x64_full( &ctx.skein, vhashB, vhashB, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhashA, 64 ); - jh512_8way_close( &ctx.jh, vhashA ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhashB, 64 ); - jh512_8way_close( &ctx.jh, vhashB ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhashA, 64 ); + jh512_8x64_close( &ctx.jh, vhashA ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhashB, 64 ); + jh512_8x64_close( &ctx.jh, vhashB ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhashA, 64 ); - keccak512_8way_close( &ctx.keccak, vhashA ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhashB, 64 ); - keccak512_8way_close( &ctx.keccak, vhashB ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhashA, 64 ); + keccak512_8x64_close( &ctx.keccak, vhashA ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhashB, 64 ); + keccak512_8x64_close( &ctx.keccak, vhashB ); // rintrlv_8x64_4x128( vhashC, vhashD, vhashA, 512 ); @@ -310,18 +307,17 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB, */ - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhashA, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhashA ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhashA, 64 ); + 
hamsi512_8x64_close( &ctx.hamsi, vhashA ); dintrlv_8x64_512( hash00, hash01, hash02, hash03, hash04, hash05, hash06, hash07, vhashA ); - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhashB, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhashB ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhashB, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhashB ); dintrlv_8x64_512( hash08, hash09, hash10, hash11, hash12, hash13, hash14, hash15, vhashB ); - fugue512_full( &ctx.fugue, hash00, hash00, 64 ); fugue512_full( &ctx.fugue, hash01, hash01, 64 ); fugue512_full( &ctx.fugue, hash02, hash02, 64 ); @@ -344,9 +340,9 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB, hash08, hash09, hash10, hash11, hash12, hash13, hash14, hash15 ); - shabal512_16way_init( &ctx.shabal ); - shabal512_16way_update( &ctx.shabal, vhashA, 64 ); - shabal512_16way_close( &ctx.shabal, vhashA ); + shabal512_16x32_init( &ctx.shabal ); + shabal512_16x32_update( &ctx.shabal, vhashA, 64 ); + shabal512_16x32_close( &ctx.shabal, vhashA ); dintrlv_16x32_512( hash00, hash01, hash02, hash03, hash04, hash05, hash06, hash07, @@ -375,12 +371,12 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB, intrlv_8x64_512( vhashB, hash08, hash09, hash10, hash11, hash12, hash13, hash14, hash15 ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhashA, 64 ); - sha512_8way_close( &ctx.sha512, vhashA ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhashB, 64 ); - sha512_8way_close( &ctx.sha512, vhashB ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhashA, 64 ); + sha512_8x64_close( &ctx.sha512, vhashA ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhashB, 64 ); + sha512_8x64_close( &ctx.sha512, vhashB ); dintrlv_8x64_512( hash00, hash01, hash02, hash03, hash04, hash05, hash06, hash07, vhashA ); @@ -391,9 +387,9 @@ int x17_16x64_hash( void *state, const __m512i nonceA, const __m512i nonceB, hash08, hash09, hash10, hash11, hash12, hash13, hash14, hash15 ); - haval256_5_16way_init( &ctx.haval ); - haval256_5_16way_update( &ctx.haval, vhashA, 64 ); - haval256_5_16way_close( &ctx.haval, state ); + haval256_16x32_init( &ctx.haval ); + haval256_16x32_update( &ctx.haval, vhashA, 64 ); + haval256_16x32_close( &ctx.haval, state ); return 1; } @@ -425,7 +421,7 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce, edata[4] = v128_swap64_32( casti_v128u32( pdata, 4 ) ); mm512_intrlv80_8x64( vdata, edata ); - blake512_8way_prehash_le( &blake512_8way_ctx, x17_16way_midstate, vdata ); + blake512_8x64_prehash_le( &blake512_8x64_ctx, x17_16way_midstate, vdata ); nonceA = _mm512_add_epi32( casti_m512i( vdata, 9 ), _mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 ) ); @@ -456,11 +452,11 @@ int scanhash_x17_16x32( struct work *work, uint32_t max_nonce, union _x17_8way_context_overlay { - blake512_8way_context blake; + blake512_8x64_context blake; bmw512_8x64_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cube_4way_2buf_context cube; #if defined(__VAES__) @@ -473,17 +469,17 @@ union _x17_8way_context_overlay hashState_echo echo; #endif simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_8way_context shabal; + shabal512_8x32_context shabal; 
sph_whirlpool_context whirlpool; - sha512_8way_context sha512; - haval256_5_8way_context haval; + sha512_8x64_context sha512; + haval256_8x32_context haval; } __attribute__ ((aligned (64))); typedef union _x17_8way_context_overlay x17_8way_context_overlay; static __thread __m512i x17_8way_midstate[16] __attribute__((aligned(64))); -static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64))); +static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64))); int x17_8x64_hash( void *state, const void *input, int thr_id ) { @@ -500,7 +496,7 @@ int x17_8x64_hash( void *state, const void *input, int thr_id ) uint64_t hash7[8] __attribute__ ((aligned (32))); x17_8way_context_overlay ctx; - blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ), + blake512_8x64_final_le( &blake512_8x64_ctx, vhash, casti_m512i( input, 9 ), x17_8way_midstate ); bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 ); @@ -533,15 +529,15 @@ int x17_8x64_hash( void *state, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); @@ -611,9 +607,9 @@ int x17_8x64_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -629,9 +625,9 @@ int x17_8x64_hash( void *state, const void *input, int thr_id ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -648,15 +644,15 @@ int x17_8x64_hash( void *state, const void *input, int thr_id ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, 64 ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, 64 ); + sha512_8x64_close( &ctx.sha512, vhash ); rintrlv_8x64_8x32( vhashA, vhash, 512 ); - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhashA, 64 ); - haval256_5_8way_close( &ctx.haval, state ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhashA, 64 ); + haval256_8x32_close( &ctx.haval, state ); return 1; } @@ -690,7 +686,7 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce, mm512_intrlv80_8x64( vdata, edata ); *noncev = _mm512_add_epi32( *noncev, 
_mm512_set_epi32( 0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) ); - blake512_8way_prehash_le( &blake512_8way_ctx, x17_8way_midstate, vdata ); + blake512_8x64_prehash_le( &blake512_8x64_ctx, x17_8way_midstate, vdata ); do { @@ -717,7 +713,7 @@ int scanhash_x17_8x64( struct work *work, uint32_t max_nonce, union _x17_4way_context_overlay { - blake512_4way_context blake; + blake512_4x64_context blake; bmw512_4x64_context bmw; #if defined(__VAES__) groestl512_2way_context groestl; @@ -726,24 +722,24 @@ union _x17_4way_context_overlay hashState_groestl groestl; hashState_echo echo; #endif - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; + skein512_4x64_context skein; + jh512_4x64_context jh; + keccak512_4x64_context keccak; luffa_2way_context luffa; cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hamsi512_4way_context hamsi; + hamsi512_4x64_context hamsi; hashState_fugue fugue; - shabal512_4way_context shabal; + shabal512_4x32_context shabal; sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; + sha512_4x64_context sha512; + haval256_4x32_context haval; }; typedef union _x17_4way_context_overlay x17_4way_context_overlay; static __thread __m256i x17_4way_midstate[16] __attribute__((aligned(64))); -static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64))); +static __thread blake512_4x64_context blake512_4x64_ctx __attribute__((aligned(64))); int x17_4x64_hash( void *state, const void *input, int thr_id ) { @@ -756,11 +752,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id ) uint64_t hash3[8] __attribute__ ((aligned (32))); x17_4way_context_overlay ctx; - blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ), + blake512_4x64_final_le( &blake512_4x64_ctx, vhash, casti_m256i( input, 9 ), x17_4way_midstate ); -// blake512_4way_full( &ctx.blake, vhash, input, 80 ); - bmw512_4x64_init( &ctx.bmw ); bmw512_4x64_update( &ctx.bmw, vhash, 64 ); bmw512_4x64_close( &ctx.bmw, vhash ); @@ -789,13 +783,13 @@ int x17_4x64_hash( void *state, const void *input, int thr_id ) skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -836,9 +830,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -849,9 +843,9 @@ int x17_4x64_hash( void *state, const void *input, int thr_id ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, 64 ); + shabal512_4x32_close( 
&ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -862,15 +856,15 @@ int x17_4x64_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, 64 ); + sha512_4x64_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashB, vhash, 512 ); - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhashB, 64 ); - haval256_5_4way_close( &ctx.haval, state ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhashB, 64 ); + haval256_4x32_close( &ctx.haval, state ); return 1; } @@ -903,7 +897,7 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce, mm256_intrlv80_4x64( vdata, edata ); *noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( 0,3,0,2, 0,1,0,0 ) ); - blake512_4way_prehash_le( &blake512_4way_ctx, x17_4way_midstate, vdata ); + blake512_4x64_prehash_le( &blake512_4x64_ctx, x17_4way_midstate, vdata ); do { diff --git a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h index 6bc030c..63fe1af 100644 --- a/algo/x17/x17-gate.h +++ b/algo/x17/x17-gate.h @@ -6,10 +6,8 @@ #if defined(SIMD512) #define X17_8WAY 1 -// #define X17_16X32 1 #elif defined(__AVX2__) && defined(__AES__) #define X17_4WAY 1 - #define X17_8X32 1 #elif defined(__SSE2__) || defined(__ARM_NEON) #define X17_2X64 1 #endif diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index e958c16..e6db42e 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -31,20 +31,20 @@ union _xevan_8way_context_overlay { - blake512_8way_context blake; - bmw512_8way_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + blake512_8x64_context blake; + bmw512_8x64_context bmw; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cube_4way_context cube; simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_8way_context shabal; + shabal512_8x32_context shabal; sph_whirlpool_context whirlpool; - sha512_8way_context sha512; - haval256_5_8way_context haval; + sha512_8x64_context sha512; + haval256_8x32_context haval; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -73,10 +73,10 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) const int dataLen = 128; xevan_8way_context_overlay ctx __attribute__ ((aligned (64))); - blake512_8way_full( &ctx.blake, vhash, input, 80 ); + blake512_8x64_full( &ctx.blake, vhash, input, 80 ); memset( &vhash[8<<3], 0, 64<<3 ); - bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, dataLen ); #if defined(__VAES__) @@ -106,15 +106,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, dataLen ); + skein512_8x64_full( &ctx.skein, vhash, vhash, dataLen ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, dataLen ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, dataLen ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, dataLen ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); 
+ keccak512_8x64_update( &ctx.keccak, vhash, dataLen ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); @@ -185,9 +185,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, dataLen ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); @@ -204,9 +204,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, dataLen ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, dataLen ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); @@ -223,23 +223,23 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, dataLen ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, dataLen ); + sha512_8x64_close( &ctx.sha512, vhash ); rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 ); - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhashA, dataLen ); - haval256_5_8way_close( &ctx.haval, vhashA ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhashA, dataLen ); + haval256_8x32_close( &ctx.haval, vhashA ); rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 ); memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 ); - blake512_8way_full( &ctx.blake, vhash, vhash, dataLen ); + blake512_8x64_full( &ctx.blake, vhash, vhash, dataLen ); - bmw512_8way_full( &ctx.bmw, vhash, vhash, dataLen ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, dataLen ); #if defined(__VAES__) @@ -269,15 +269,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, dataLen ); + skein512_8x64_full( &ctx.skein, vhash, vhash, dataLen ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, dataLen ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, dataLen ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, dataLen ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, dataLen ); + keccak512_8x64_close( &ctx.keccak, vhash ); rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); @@ -348,9 +348,9 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) #endif - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, dataLen ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); @@ -367,9 +367,9 @@ int xevan_8way_hash( void 
*output, const void *input, int thr_id ) intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, dataLen ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, dataLen ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); @@ -386,15 +386,15 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, dataLen ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, dataLen ); + sha512_8x64_close( &ctx.sha512, vhash ); rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 ); - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhashA, dataLen ); - haval256_5_8way_close( &ctx.haval, output ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhashA, dataLen ); + haval256_8x32_close( &ctx.haval, output ); return 1; } @@ -403,28 +403,28 @@ int xevan_8way_hash( void *output, const void *input, int thr_id ) union _xevan_4way_context_overlay { - blake512_4way_context blake; - bmw512_4way_context bmw; + blake512_4x64_context blake; + bmw512_4x64_context bmw; #if defined(__VAES__) groestl512_2way_context groestl; echo_2way_context echo; #else - hashState_groestl groestl; + hashState_groestl groestl; hashState_echo echo; #endif - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; + skein512_4x64_context skein; + jh512_4x64_context jh; + keccak512_4x64_context keccak; luffa_2way_context luffa; cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hamsi512_4way_context hamsi; + hamsi512_4x64_context hamsi; hashState_fugue fugue; - shabal512_4way_context shabal; + shabal512_4x32_context shabal; sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; + sha512_4x64_context sha512; + haval256_4x32_context haval; }; typedef union _xevan_4way_context_overlay xevan_4way_context_overlay; @@ -440,12 +440,12 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) const int dataLen = 128; xevan_4way_context_overlay ctx __attribute__ ((aligned (64))); - blake512_4way_full( &ctx.blake, vhash, input, 80 ); + blake512_4x64_full( &ctx.blake, vhash, input, 80 ); memset( &vhash[8<<2], 0, 64<<2 ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, dataLen ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, dataLen ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -469,15 +469,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, dataLen ); + skein512_4x64_full( &ctx.skein, vhash, vhash, dataLen ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, dataLen ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, dataLen ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, dataLen ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( 
&ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, dataLen ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); @@ -518,9 +518,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, dataLen ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -532,9 +532,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) // Parallel 4way 32 bit intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, dataLen ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, dataLen ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -546,27 +546,27 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, dataLen ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, dataLen ); + sha512_4x64_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 ); - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhashA, dataLen ); - haval256_5_4way_close( &ctx.haval, vhashA ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhashA, dataLen ); + haval256_4x32_close( &ctx.haval, vhashA ); rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 ); memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 ); - blake512_4way_init( &ctx.blake ); - blake512_4way_update( &ctx.blake, vhash, dataLen ); - blake512_4way_close(&ctx.blake, vhash); + blake512_4x64_init( &ctx.blake ); + blake512_4x64_update( &ctx.blake, vhash, dataLen ); + blake512_4x64_close(&ctx.blake, vhash); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, dataLen ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, dataLen ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -590,15 +590,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, dataLen ); + skein512_4x64_full( &ctx.skein, vhash, vhash, dataLen ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, dataLen ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, dataLen ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, dataLen ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, dataLen ); + keccak512_4x64_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); @@ -639,9 +639,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) #endif - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( 
&ctx.hamsi, vhash, dataLen ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -652,9 +652,9 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, dataLen ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, dataLen ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -665,15 +665,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, dataLen ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, dataLen ); + sha512_4x64_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 ); - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhashA, dataLen ); - haval256_5_4way_close( &ctx.haval, output ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhashA, dataLen ); + haval256_4x32_close( &ctx.haval, output ); return 1; } diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index 35a93de..23fea8d 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -32,24 +32,24 @@ union _x22i_8way_ctx_overlay { - blake512_8way_context blake; - bmw512_8way_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + blake512_8x64_context blake; + bmw512_8x64_context bmw; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cube_4way_context cube; simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_8way_context shabal; + shabal512_8x32_context shabal; sph_whirlpool_context whirlpool; - sha512_8way_context sha512; - haval256_5_8way_context haval; + sha512_8x64_context sha512; + haval256_8x32_context haval; sph_tiger_context tiger; sph_gost512_context gost; #if !defined(X22I_8WAY_SHA) - sha256_8way_context sha256; + sha256_8x32_context sha256; #endif #if defined(__VAES__) groestl512_4way_context groestl; @@ -88,9 +88,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) unsigned char hashA7[64] __attribute__((aligned(32))) = {0}; x22i_8way_ctx_overlay ctx; - blake512_8way_full( &ctx.blake, vhash, input, 80 ); + blake512_8x64_full( &ctx.blake, vhash, input, 80 ); - bmw512_8way_full( &ctx.bmw, vhash, vhash, 64 ); + bmw512_8x64_full( &ctx.bmw, vhash, vhash, 64 ); #if defined(__VAES__) @@ -120,15 +120,15 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) #endif - skein512_8way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); if ( 
work_restart[thrid].restart ) return 0; @@ -219,9 +219,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -238,9 +238,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8], &hash4[8], &hash5[8], &hash6[8], &hash7[8], vhash ); @@ -273,9 +273,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) intrlv_8x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16], &hash4[16], &hash5[16], &hash6[16], &hash7[16] ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, 64 ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, 64 ); + sha512_8x64_close( &ctx.sha512, vhash ); dintrlv_8x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24], &hash4[24], &hash5[24], &hash6[24], &hash7[24], vhash ); @@ -294,9 +294,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) memset( vhash, 0, 64*8 ); - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhashA, 64 ); - haval256_5_8way_close( &ctx.haval, vhash ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhashA, 64 ); + haval256_8x32_close( &ctx.haval, vhash ); dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -400,9 +400,9 @@ int x22i_8way_hash( void *output, const void *input, int thrid ) intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); - sha256_8way_init( &ctx.sha256 ); - sha256_8way_update( &ctx.sha256, vhash, 64 ); - sha256_8way_close( &ctx.sha256, output ); + sha256_8x32_init( &ctx.sha256 ); + sha256_8x32_update( &ctx.sha256, vhash, 64 ); + sha256_8x32_close( &ctx.sha256, output ); #endif @@ -427,8 +427,6 @@ int scanhash_x22i_8way_sha( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x08ff; - InitializeSWIFFTX(); - mm512_bswap32_intrlv80_8x64( vdata, pdata ); *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, @@ -472,8 +470,6 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x08ff; - InitializeSWIFFTX(); - mm512_bswap32_intrlv80_8x64( vdata, pdata ); *noncev = mm512_intrlv_blend_32( _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, @@ -506,8 +502,8 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce, union _x22i_4way_ctx_overlay { - blake512_4way_context blake; - bmw512_4way_context bmw; + blake512_4x64_context blake; + bmw512_4x64_context bmw; #if defined(__VAES__) groestl512_2way_context groestl; echo_2way_context echo; @@ -516,22 +512,22 @@ union _x22i_4way_ctx_overlay hashState_echo echo; #endif shavite512_2way_context shavite; - skein512_4way_context skein; - jh512_4way_context jh; 
- keccak512_4way_context keccak; + skein512_4x64_context skein; + jh512_4x64_context jh; + keccak512_4x64_context keccak; luffa_2way_context luffa; cube_2way_context cube; simd_2way_context simd; - hamsi512_4way_context hamsi; + hamsi512_4x64_context hamsi; hashState_fugue fugue; - shabal512_4way_context shabal; + shabal512_4x32_context shabal; sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; + sha512_4x64_context sha512; + haval256_4x32_context haval; sph_tiger_context tiger; sph_gost512_context gost; #if !defined(X22I_4WAY_SHA) - sha256_4way_context sha256; + sha256_4x32_context sha256; #endif }; typedef union _x22i_4way_ctx_overlay x22i_ctx_overlay; @@ -551,11 +547,11 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) unsigned char hashA3[64] __attribute__((aligned(32))) = {0}; x22i_ctx_overlay ctx; - blake512_4way_full( &ctx.blake, vhash, input, 80 ); + blake512_4x64_full( &ctx.blake, vhash, input, 80 ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); #if defined(__VAES__) @@ -579,15 +575,15 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) #endif - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); if ( work_restart[thrid].restart ) return false; @@ -632,9 +628,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return false; - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); fugue512_full( &ctx.fugue, hash0, hash0, 64 ); @@ -644,9 +640,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, 64 ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8], vhash ); sph_whirlpool_init( &ctx.whirlpool ); @@ -664,9 +660,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) intrlv_4x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16] ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, 64 ); + sha512_4x64_close( &ctx.sha512, vhash ); dintrlv_4x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash ); if ( work_restart[thrid].restart ) return false; @@ -680,9 +676,9 @@ 
int x22i_4way_hash( void *output, const void *input, int thrid ) memset( vhash, 0, 64*4 ); - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhashA, 64 ); - haval256_5_4way_close( &ctx.haval, vhash ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhashA, 64 ); + haval256_4x32_close( &ctx.haval, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); memset( hashA0, 0, 64 ); @@ -743,9 +739,9 @@ int x22i_4way_hash( void *output, const void *input, int thrid ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); - sha256_4way_init( &ctx.sha256 ); - sha256_4way_update( &ctx.sha256, vhash, 64 ); - sha256_4way_close( &ctx.sha256, output ); + sha256_4x32_init( &ctx.sha256 ); + sha256_4x32_update( &ctx.sha256, vhash, 64 ); + sha256_4x32_close( &ctx.sha256, output ); #endif @@ -770,8 +766,6 @@ int scanhash_x22i_4way_sha( struct work* work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x08ff; - InitializeSWIFFTX(); - mm256_bswap32_intrlv80_4x64( vdata, pdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); @@ -814,8 +808,6 @@ int scanhash_x22i_4way( struct work* work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x08ff; - InitializeSWIFFTX(); - mm256_bswap32_intrlv80_4x64( vdata, pdata ); *noncev = mm256_intrlv_blend_32( _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index 755f61d..c377de0 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -33,6 +33,7 @@ bool register_x22i_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT | AVX512_OPT | VAES_OPT | NEON_OPT; + InitializeSWIFFTX(); return true; }; diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index b223555..77faa77 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -63,29 +63,29 @@ void x25x_shuffle( void *hash ) union _x25x_8way_ctx_overlay { - blake512_8way_context blake; - bmw512_8way_context bmw; - skein512_8way_context skein; - jh512_8way_context jh; - keccak512_8way_context keccak; + blake512_8x64_context blake; + bmw512_8x64_context bmw; + skein512_8x64_context skein; + jh512_8x64_context jh; + keccak512_8x64_context keccak; luffa_4way_context luffa; cube_4way_context cube; simd_4way_context simd; - hamsi512_8way_context hamsi; + hamsi512_8x64_context hamsi; hashState_fugue fugue; - shabal512_8way_context shabal; + shabal512_8x32_context shabal; sph_whirlpool_context whirlpool; - sha512_8way_context sha512; - haval256_5_8way_context haval; + sha512_8x64_context sha512; + haval256_8x32_context haval; sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_8WAY_SHA) sha256_context sha256; #else - sha256_8way_context sha256; + sha256_8x32_context sha256; #endif - panama_8way_context panama; - blake2s_8way_state blake2s; + panama_8x32_context panama; + blake2s_8x32_state blake2s; #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context shavite; @@ -99,7 +99,7 @@ union _x25x_8way_ctx_overlay typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay; static __thread __m512i x25x_8way_midstate[16] __attribute__((aligned(64))); -static __thread blake512_8way_context blake512_8way_ctx __attribute__((aligned(64))); +static __thread blake512_8x64_context blake512_8x64_ctx __attribute__((aligned(64))); int x25x_8way_hash( void *output, const void *input, int thrid ) { @@ -117,15 +117,15 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) 
uint64_t vhashB[8*8] __attribute__ ((aligned (64))); x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64))); - blake512_8way_final_le( &blake512_8way_ctx, vhash, casti_m512i( input, 9 ), + blake512_8x64_final_le( &blake512_8x64_ctx, vhash, casti_m512i( input, 9 ), x25x_8way_midstate ); dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0], hash4[0], hash5[0], hash6[0], hash7[0], vhash ); - bmw512_8way_init( &ctx.bmw ); - bmw512_8way_update( &ctx.bmw, vhash, 64 ); - bmw512_8way_close( &ctx.bmw, vhash ); + bmw512_8x64_init( &ctx.bmw ); + bmw512_8x64_update( &ctx.bmw, vhash, 64 ); + bmw512_8x64_close( &ctx.bmw, vhash ); dintrlv_8x64_512( hash0[1], hash1[1], hash2[1], hash3[1], hash4[1], hash5[1], hash6[1], hash7[1], vhash ); @@ -175,21 +175,19 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) #endif - skein512_8way_init( &ctx.skein ); - skein512_8way_update( &ctx.skein, vhash, 64 ); - skein512_8way_close( &ctx.skein, vhash ); + skein512_8x64_full( &ctx.skein, vhash, vhash, 64 ); dintrlv_8x64_512( hash0[3], hash1[3], hash2[3], hash3[3], hash4[3], hash5[3], hash6[3], hash7[3], vhash ); - jh512_8way_init( &ctx.jh ); - jh512_8way_update( &ctx.jh, vhash, 64 ); - jh512_8way_close( &ctx.jh, vhash ); + jh512_8x64_init( &ctx.jh ); + jh512_8x64_update( &ctx.jh, vhash, 64 ); + jh512_8x64_close( &ctx.jh, vhash ); dintrlv_8x64_512( hash0[4], hash1[4], hash2[4], hash3[4], hash4[4], hash5[4], hash6[4], hash7[4], vhash ); - keccak512_8way_init( &ctx.keccak ); - keccak512_8way_update( &ctx.keccak, vhash, 64 ); - keccak512_8way_close( &ctx.keccak, vhash ); + keccak512_8x64_init( &ctx.keccak ); + keccak512_8x64_update( &ctx.keccak, vhash, 64 ); + keccak512_8x64_close( &ctx.keccak, vhash ); dintrlv_8x64_512( hash0[5], hash1[5], hash2[5], hash3[5], hash4[5], hash5[5], hash6[5], hash7[5], vhash ); @@ -303,9 +301,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; - hamsi512_8way_init( &ctx.hamsi ); - hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_8way_close( &ctx.hamsi, vhash ); + hamsi512_8x64_init( &ctx.hamsi ); + hamsi512_8x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8x64_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0[11], hash1[11], hash2[11], hash3[11], hash4[11], hash5[11], hash6[11], hash7[11], vhash ); @@ -321,9 +319,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) intrlv_8x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12], hash4[12], hash5[12], hash6[12], hash7[12] ); - shabal512_8way_init( &ctx.shabal ); - shabal512_8way_update( &ctx.shabal, vhash, 64 ); - shabal512_8way_close( &ctx.shabal, vhash ); + shabal512_8x32_init( &ctx.shabal ); + shabal512_8x32_update( &ctx.shabal, vhash, 64 ); + shabal512_8x32_close( &ctx.shabal, vhash ); dintrlv_8x32_512( hash0[13], hash1[13], hash2[13], hash3[13], hash4[13], hash5[13], hash6[13], hash7[13], vhash ); @@ -354,9 +352,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) intrlv_8x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14], hash4[14], hash5[14], hash6[14], hash7[14] ); - sha512_8way_init( &ctx.sha512 ); - sha512_8way_update( &ctx.sha512, vhash, 64 ); - sha512_8way_close( &ctx.sha512, vhash ); + sha512_8x64_init( &ctx.sha512 ); + sha512_8x64_update( &ctx.sha512, vhash, 64 ); + sha512_8x64_close( &ctx.sha512, vhash ); dintrlv_8x64_512( hash0[15], hash1[15], hash2[15], hash3[15], hash4[15], hash5[15], hash6[15], hash7[15], vhash ); @@ -372,9 +370,9 @@ int x25x_8way_hash( void *output, const void *input, 
int thrid ) hash4[16], hash5[16], hash6[16], hash7[16] ); memset( vhash, 0, 64*8 ); - haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhashA, 64 ); - haval256_5_8way_close( &ctx.haval, vhash ); + haval256_8x32_init( &ctx.haval ); + haval256_8x32_update( &ctx.haval, vhashA, 64 ); + haval256_8x32_close( &ctx.haval, vhash ); dintrlv_8x32_512( hash0[17], hash1[17], hash2[17], hash3[17], hash4[17], hash5[17], hash6[17], hash7[17], vhash ); @@ -462,17 +460,17 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) intrlv_8x32_512( vhashA, hash0[20], hash1[20], hash2[20], hash3[20], hash4[20], hash5[20], hash6[20], hash7[20] ); - sha256_8way_init( &ctx.sha256 ); - sha256_8way_update( &ctx.sha256, vhashA, 64 ); - sha256_8way_close( &ctx.sha256, vhash ); + sha256_8x32_init( &ctx.sha256 ); + sha256_8x32_update( &ctx.sha256, vhashA, 64 ); + sha256_8x32_close( &ctx.sha256, vhash ); dintrlv_8x32_512( hash0[21], hash1[21], hash2[21], hash3[21], hash4[21], hash5[21], hash6[21], hash7[21], vhash ); #endif - panama_8way_init( &ctx.panama ); - panama_8way_update( &ctx.panama, vhash, 64 ); - panama_8way_close( &ctx.panama, vhash ); + panama_8x32_init( &ctx.panama ); + panama_8x32_update( &ctx.panama, vhash, 64 ); + panama_8x32_close( &ctx.panama, vhash ); dintrlv_8x32_512( hash0[22], hash1[22], hash2[22], hash3[22], hash4[22], hash5[22], hash6[22], hash7[22], vhash ); @@ -545,8 +543,8 @@ int x25x_8way_hash( void *output, const void *input, int thrid ) intrlv_8x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23], hash4[23], hash5[23], hash6[23], hash7[23] ); - blake2s_8way_init( &ctx.blake2s, 32 ); - blake2s_8way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 ); + blake2s_8x32_init( &ctx.blake2s, 32 ); + blake2s_8x32_full_blocks( &ctx.blake2s, output, vhashX, 64*24 ); return 1; } @@ -578,14 +576,13 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce, edata[4] = v128_swap64_32( casti_v128( pdata, 4 ) ); mm512_intrlv80_8x64( vdata, edata ); - *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32( - 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0 ) ); - blake512_8way_prehash_le( &blake512_8way_ctx, x25x_8way_midstate, vdata ); + *noncev = _mm512_add_epi32( *noncev, _mm512_set_epi64( + 7, 6, 5, 4, 3, 2, 1, 0 ) ); + blake512_8x64_prehash_le( &blake512_8x64_ctx, x25x_8way_midstate, vdata ); do { - if ( x25x_8way_hash( hash, vdata, thr_id ) ); - + if ( x25x_8way_hash( hash, vdata, thr_id ) ) for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( ( hashd7[ lane ] <= targ32 ) && !bench ) ) { @@ -608,8 +605,8 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce, union _x25x_4way_ctx_overlay { - blake512_4way_context blake; - bmw512_4way_context bmw; + blake512_4x64_context blake; + bmw512_4x64_context bmw; #if defined(__VAES__) groestl512_2way_context groestl; echo_2way_context echo; @@ -617,34 +614,34 @@ union _x25x_4way_ctx_overlay hashState_groestl groestl; hashState_echo echo; #endif - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; + skein512_4x64_context skein; + jh512_4x64_context jh; + keccak512_4x64_context keccak; luffa_2way_context luffa; cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hamsi512_4way_context hamsi; + hamsi512_4x64_context hamsi; hashState_fugue fugue; - shabal512_4way_context shabal; + shabal512_4x32_context shabal; sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; + sha512_4x64_context sha512; + 
haval256_4x32_context haval; sph_tiger_context tiger; sph_gost512_context gost; #if defined(X25X_4WAY_SHA) sha256_context sha256; #else - sha256_4way_context sha256; + sha256_4x32_context sha256; #endif - panama_4way_context panama; - blake2s_4way_state blake2s; + panama_4x32_context panama; + blake2s_4x32_state blake2s; }; typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay; static __thread __m256i x25x_4way_midstate[16] __attribute__((aligned(64))); -static __thread blake512_4way_context blake512_4way_ctx __attribute__((aligned(64))); +static __thread blake512_4x64_context blake512_4x64_ctx __attribute__((aligned(64))); int x25x_4way_hash( void *output, const void *input, int thrid ) { @@ -658,14 +655,14 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) uint64_t vhashB[8*4] __attribute__ ((aligned (64))); x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64))); - blake512_4way_final_le( &blake512_4way_ctx, vhash, casti_m256i( input, 9 ), + blake512_4x64_final_le( &blake512_4x64_ctx, vhash, casti_m256i( input, 9 ), x25x_4way_midstate ); dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash ); - bmw512_4way_init( &ctx.bmw ); - bmw512_4way_update( &ctx.bmw, vhash, 64 ); - bmw512_4way_close( &ctx.bmw, vhash ); + bmw512_4x64_init( &ctx.bmw ); + bmw512_4x64_update( &ctx.bmw, vhash, 64 ); + bmw512_4x64_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash ); #if defined(__VAES__) @@ -688,19 +685,19 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) #endif intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] ); - skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); + skein512_4x64_full( &ctx.skein, vhash, vhash, 64 ); dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash ); - jh512_4way_init( &ctx.jh ); - jh512_4way_update( &ctx.jh, vhash, 64 ); - jh512_4way_close( &ctx.jh, vhash ); + jh512_4x64_init( &ctx.jh ); + jh512_4x64_update( &ctx.jh, vhash, 64 ); + jh512_4x64_close( &ctx.jh, vhash ); dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash ); if ( work_restart[thrid].restart ) return 0; - keccak512_4way_init( &ctx.keccak ); - keccak512_4way_update( &ctx.keccak, vhash, 64 ); - keccak512_4way_close( &ctx.keccak, vhash ); + keccak512_4x64_init( &ctx.keccak ); + keccak512_4x64_update( &ctx.keccak, vhash, 64 ); + keccak512_4x64_close( &ctx.keccak, vhash ); dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -751,9 +748,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; - hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); - hamsi512_4way_close( &ctx.hamsi, vhash ); + hamsi512_4x64_init( &ctx.hamsi ); + hamsi512_4x64_update( &ctx.hamsi, vhash, 64 ); + hamsi512_4x64_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash ); fugue512_full( &ctx.fugue, hash0[12], hash0[11], 64 ); @@ -763,9 +760,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] ); - shabal512_4way_init( &ctx.shabal ); - shabal512_4way_update( &ctx.shabal, vhash, 64 ); - shabal512_4way_close( &ctx.shabal, vhash ); + shabal512_4x32_init( &ctx.shabal ); + shabal512_4x32_update( &ctx.shabal, vhash, 64 ); + shabal512_4x32_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash ); 
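// Editor's note: every step of this chain follows the same pattern: interleave
// four scalar lanes into one SIMD buffer, run the 4-way hash, then de-interleave
// the result back into per-lane buffers. Below is a minimal scalar sketch of
// what the 4x32 interleave/de-interleave helpers are assumed to do; the "_ref"
// names are the editor's and illustrative only, the real intrlv_4x32_512 /
// dintrlv_4x32_512 are SIMD implementations of the same idea.

#include <stdint.h>

// Interleave four 512-bit lanes of 32-bit words: word i of lane n lands at
// dst[ 4*i + n ], so one vector load picks up the same word from all 4 lanes.
static void intrlv_4x32_512_ref( uint32_t *dst, const uint32_t *s0,
                  const uint32_t *s1, const uint32_t *s2, const uint32_t *s3 )
{
   for ( int i = 0; i < 16; i++ )
   {
      dst[ 4*i     ] = s0[i];
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];
   }
}

// De-interleave back into four scalar 512-bit lane buffers.
static void dintrlv_4x32_512_ref( uint32_t *d0, uint32_t *d1, uint32_t *d2,
                                  uint32_t *d3, const uint32_t *src )
{
   for ( int i = 0; i < 16; i++ )
   {
      d0[i] = src[ 4*i     ];
      d1[i] = src[ 4*i + 1 ];
      d2[i] = src[ 4*i + 2 ];
      d3[i] = src[ 4*i + 3 ];
   }
}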
sph_whirlpool_init(&ctx.whirlpool); @@ -783,9 +780,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] ); - sha512_4way_init( &ctx.sha512 ); - sha512_4way_update( &ctx.sha512, vhash, 64 ); - sha512_4way_close( &ctx.sha512, vhash ); + sha512_4x64_init( &ctx.sha512 ); + sha512_4x64_update( &ctx.sha512, vhash, 64 ); + sha512_4x64_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash ); ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]); @@ -797,9 +794,9 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) memset( vhash, 0, 64*4 ); - haval256_5_4way_init( &ctx.haval ); - haval256_5_4way_update( &ctx.haval, vhashX[0], 64 ); - haval256_5_4way_close( &ctx.haval, vhash ); + haval256_4x32_init( &ctx.haval ); + haval256_4x32_update( &ctx.haval, vhashX[0], 64 ); + haval256_4x32_close( &ctx.haval, vhash ); dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash ); sph_tiger_init(&ctx.tiger); @@ -853,16 +850,16 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) intrlv_4x32_512( vhashX[0], hash0[20], hash1[20], hash2[20], hash3[20] ); memset( vhash, 0, 64*4 ); - sha256_4way_init( &ctx.sha256 ); - sha256_4way_update( &ctx.sha256, vhashX[0], 64 ); - sha256_4way_close( &ctx.sha256, vhash ); + sha256_4x32_init( &ctx.sha256 ); + sha256_4x32_update( &ctx.sha256, vhashX[0], 64 ); + sha256_4x32_close( &ctx.sha256, vhash ); dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash ); #endif - panama_4way_init( &ctx.panama ); - panama_4way_update( &ctx.panama, vhash, 64 ); - panama_4way_close( &ctx.panama, vhash ); + panama_4x32_init( &ctx.panama ); + panama_4x32_update( &ctx.panama, vhash, 64 ); + panama_4x32_close( &ctx.panama, vhash ); dintrlv_4x32_512( hash0[22], hash1[22], hash2[22], hash3[22], vhash ); laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]); @@ -902,8 +899,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid ) intrlv_4x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22] ); intrlv_4x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23] ); - blake2s_4way_init( &ctx.blake2s, 32 ); - blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 ); + blake2s_4x32_init( &ctx.blake2s, 32 ); + blake2s_4x32_full_blocks( &ctx.blake2s, output, vhashX, 64*24 ); return 1; } @@ -936,9 +933,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce, edata[4] = v128_swap64_32( casti_v128( pdata, 4 ) ); mm256_intrlv80_4x64( vdata, edata ); - *noncev = _mm256_add_epi32( *noncev, _mm256_set_epi32( - 0, 3, 0, 2, 0, 1, 0, 0 ) ); - blake512_4way_prehash_le( &blake512_4way_ctx, x25x_4way_midstate, vdata ); + *noncev = _mm256_add_epi32( *noncev, _mm256_set_epi64x( 3, 2, 1, 0 ) ); + blake512_4x64_prehash_le( &blake512_4x64_ctx, x25x_4way_midstate, vdata ); do { diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index fb7e5e4..33391f7 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -231,7 +231,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce, do { edata[19] = n; - if ( x25x_hash( hash64, edata, thr_id ) ); + if ( x25x_hash( hash64, edata, thr_id ) ) if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) ) { pdata[19] = bswap_32( n ); diff --git a/algo/yespower/yespower-4way.c b/algo/yespower/yespower-4way.c deleted file mode 100644 index 8f38e90..0000000 --- a/algo/yespower/yespower-4way.c +++ /dev/null @@ -1,692 +0,0 @@ -/*- - * 
Copyright 2009 Colin Percival - * Copyright 2013-2018 Alexander Peslyak - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - * - * This is a proof-of-work focused fork of yescrypt, including reference and - * cut-down implementation of the obsolete yescrypt 0.5 (based off its first - * submission to PHC back in 2014) and a new proof-of-work specific variation - * known as yespower 1.0. The former is intended as an upgrade for - * cryptocurrencies that already use yescrypt 0.5 and the latter may be used - * as a further upgrade (hard fork) by those and other cryptocurrencies. The - * version of algorithm to use is requested through parameters, allowing for - * both algorithms to co-exist in client and miner implementations (such as in - * preparation for a hard-fork). - * - * This is the reference implementation. Its purpose is to provide a simple - * human- and machine-readable specification that implementations intended - * for actual use should be tested against. It is deliberately mostly not - * optimized, and it is not meant to be used in production. Instead, use - * yespower-opt.c. - */ -/* -#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose." -*/ -#include -#include -#include -#include - -#include "algo/sha/hmac-sha256-hash-4way.h" -//#include "sysendian.h" - -#include "yespower.h" - - -#if defined(__AVX2__) - - -static void blkcpy_8way( __m256i *dst, const __m256i *src, size_t count ) -{ - do { - *dst++ = *src++; - } while (--count); -} - -static void blkxor_8way( __m256i *dst, const __m256i *src, size_t count ) -{ - do { - *dst++ ^= *src++; - } while (--count); -} - -/** - * salsa20(B): - * Apply the Salsa20 core to the provided block. 
- */ -static void salsa20_8way( __m256i B[16], uint32_t rounds ) -{ - __m256i x[16]; - size_t i; - - /* SIMD unshuffle */ - for ( i = 0; i < 16; i++ ) - x[i * 5 % 16] = B[i]; - - for ( i = 0; i < rounds; i += 2 ) - { -#define R( a, b, c ) mm256_rol_32( _mm256_add_epi32( a, b ), c ) - /* Operate on columns */ - - x[ 4] = _mm256_xor_si256( x[ 4], R( x[ 0], x[12], 7 ) ); - x[ 8] = _mm256_xor_si256( x[ 8], R( x[ 4], x[ 0], 9 ) ); - x[12] = _mm256_xor_si256( x[12], R( x[ 8], x[ 4], 13 ) ); - x[ 0] = _mm256_xor_si256( x[ 0], R( x[12], x[ 8], 18 ) ); - - x[ 9] = _mm256_xor_si256( x[ 9], R( x[ 5], x[ 1], 7 ) ); - x[13] = _mm256_xor_si256( x[13], R( x[ 9], x[ 5], 9 ) ); - x[ 1] = _mm256_xor_si256( x[ 1], R( x[13], x[ 9], 13 ) ); - x[ 5] = _mm256_xor_si256( x[ 5], R( x[ 1], x[13], 18 ) ); - - x[14] = _mm256_xor_si256( x[14], R( x[10], x[ 6], 7 ) ); - x[ 2] = _mm256_xor_si256( x[ 2], R( x[14], x[10], 9 ) ); - x[ 6] = _mm256_xor_si256( x[ 6], R( x[ 2], x[14], 13 ) ); - x[10] = _mm256_xor_si256( x[10], R( x[ 6], x[ 2], 18 ) ); - - x[ 3] = _mm256_xor_si256( x[ 3], R( x[15], x[11], 7 ) ); - x[ 7] = _mm256_xor_si256( x[ 7], R( x[ 3], x[15], 9 ) ); - x[11] = _mm256_xor_si256( x[11], R( x[ 7], x[ 3], 13 ) ); - x[15] = _mm256_xor_si256( x[15], R( x[11], x[ 7], 18 ) ); - - /* Operate on rows */ - - x[ 1] = _mm256_xor_si256( x[ 1], R( x[ 0], x[ 3], 7 ) ); - x[ 2] = _mm256_xor_si256( x[ 2], R( x[ 1], x[ 0], 9 ) ); - x[ 3] = _mm256_xor_si256( x[ 3], R( x[ 2], x[ 1], 13 ) ); - x[ 0] = _mm256_xor_si256( x[ 0], R( x[ 3], x[ 2], 18 ) ); - - x[ 6] = _mm256_xor_si256( x[ 6], R( x[ 5], x[ 4], 7 ) ); - x[ 7] = _mm256_xor_si256( x[ 7], R( x[ 6], x[ 5], 9 ) ); - x[ 4] = _mm256_xor_si256( x[ 4], R( x[ 7], x[ 6], 13 ) ); - x[ 5] = _mm256_xor_si256( x[ 5], R( x[ 4], x[ 7], 18 ) ); - - x[11] = _mm256_xor_si256( x[11], R( x[10], x[ 9], 7 ) ); - x[ 8] = _mm256_xor_si256( x[ 8], R( x[11], x[10], 9 ) ); - x[ 9] = _mm256_xor_si256( x[ 9], R( x[ 8], x[11], 13 ) ); - x[10] = _mm256_xor_si256( x[10], R( x[ 9], x[ 8], 18 ) ); - - x[12] = _mm256_xor_si256( x[12], R( x[15], x[14], 7 ) ); - x[13] = _mm256_xor_si256( x[13], R( x[12], x[15], 9 ) ); - x[14] = _mm256_xor_si256( x[14], R( x[13], x[12], 13 ) ); - x[15] = _mm256_xor_si256( x[15], R( x[14], x[13], 18 ) ); - -#undef R - } - - /* SIMD shuffle */ - for (i = 0; i < 16; i++) - B[i] = _mm256_add_epi32( B[i], x[i * 5 % 16] ); -} - -/** - * blockmix_salsa(B): - * Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in - * length. - */ -static void blockmix_salsa_8way( __m256i *B, uint32_t rounds ) -{ - __m256i X[16]; - size_t i; - - /* 1: X <-- B_{2r - 1} */ - blkcpy_8way( X, &B[16], 16 ); - - /* 2: for i = 0 to 2r - 1 do */ - for ( i = 0; i < 2; i++ ) - { - /* 3: X <-- H(X xor B_i) */ - blkxor_8way( X, &B[i * 16], 16 ); - salsa20_8way( X, rounds ); - - /* 4: Y_i <-- X */ - /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ - blkcpy_8way( &B[i * 16], X, 16 ); - } -} - -/* - * These are tunable, but they must meet certain constraints and are part of - * what defines a yespower version. - */ -#define PWXsimple 2 -#define PWXgather 4 -/* Version 0.5 */ -#define PWXrounds_0_5 6 -#define Swidth_0_5 8 -/* Version 1.0 */ -#define PWXrounds_1_0 3 -#define Swidth_1_0 11 - -/* Derived values. Not tunable on their own. */ -#define PWXbytes (PWXgather * PWXsimple * 8) -#define PWXwords (PWXbytes / sizeof(uint32_t)) -#define rmin ((PWXbytes + 127) / 128) - -/* Runtime derived values. Not tunable on their own. 
*/ -#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8) -#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8) - -typedef struct { - __m256i (*S0)[2], (*S1)[2], (*S2)[2]; - __m256i *S; - yespower_version_t version; - uint32_t salsa20_rounds; - uint32_t PWXrounds, Swidth, Sbytes, Smask; - size_t w; -} pwxform_8way_ctx_t __attribute__ ((aligned (128))); - -/** - * pwxform(B): - * Transform the provided block using the provided S-boxes. - */ -static void pwxform_8way( __m256i *B, pwxform_8way_ctx_t *ctx ) -{ - __m256i (*X)[PWXsimple][2] = (__m256i (*)[PWXsimple][2])B; - __m256i (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2; - __m256i Smask = _mm256_set1_epi32( ctx->Smask ); - size_t w = ctx->w; - size_t i, j, k; - - /* 1: for i = 0 to PWXrounds - 1 do */ - for ( i = 0; i < ctx->PWXrounds; i++ ) - { - /* 2: for j = 0 to PWXgather - 1 do */ - for ( j = 0; j < PWXgather; j++ ) - { -// Are these pointers or data? - __m256i xl = X[j][0][0]; - __m256i xh = X[j][0][1]; - __m256i (*p0)[2], (*p1)[2]; - - // 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) - -// playing with pointers -/* - p0 = S0 + (xl & Smask) / sizeof(*S0); - // 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) - p1 = S1 + (xh & Smask) / sizeof(*S1); -*/ - /* 5: for k = 0 to PWXsimple - 1 do */ - for ( k = 0; k < PWXsimple; k++ ) - { - -// shift from 32 bit data to 64 bit data - __m256i x0, x1, s00, s01, s10, s11; - __m128i *p0k = (__m128i*)p0[k]; - __m128i *p1k = (__m128i*)p1[k]; - - - s00 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p0k[0] ), - _mm256_slli_epi64( _mm256_cvtepu32_epi64( p0k[2] ), 32 ) ); - s01 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p0k[1] ), - _mm256_slli_epi64( _mm256_cvtepu32_epi64( p0k[3] ), 32 ) ); - s10 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p1k[0] ), - _mm256_slli_epi64( _mm256_cvtepu32_epi64( p1k[2] ), 32 ) ); - s11 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p1k[1] ), - _mm256_slli_epi64( _mm256_cvtepu32_epi64( p1k[3] ), 32 ) ); - - __m128i *xx = (__m128i*)X[j][k]; - x0 = _mm256_mul_epu32( _mm256_cvtepu32_epi64( xx[0] ), - _mm256_cvtepu32_epi64( xx[2] ) ); - x1 = _mm256_mul_epu32( _mm256_cvtepu32_epi64( xx[1] ), - _mm256_cvtepu32_epi64( xx[3] ) ); - - x0 = _mm256_add_epi64( x0, s00 ); - x1 = _mm256_add_epi64( x1, s01 ); - - x0 = _mm256_xor_si256( x0, s10 ); - x1 = _mm256_xor_si256( x1, s11 ); - - X[j][k][0] = x0; - X[j][k][1] = x1; - } - - if ( ctx->version != YESPOWER_0_5 && - ( i == 0 || j < PWXgather / 2 ) ) - { - if ( j & 1 ) - { - for ( k = 0; k < PWXsimple; k++ ) - { - S1[w][0] = X[j][k][0]; - S1[w][1] = X[j][k][1]; - w++; - } - } - else - { - for ( k = 0; k < PWXsimple; k++ ) - { - S0[w + k][0] = X[j][k][0]; - S0[w + k][1] = X[j][k][1]; - } - } - } - } - } - - if ( ctx->version != YESPOWER_0_5 ) - { - /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ - ctx->S0 = S2; - ctx->S1 = S0; - ctx->S2 = S1; - /* 15: w <-- w mod 2^Swidth */ - ctx->w = w & ( ( 1 << ctx->Swidth ) * PWXsimple - 1 ); - } -} - -/** - * blockmix_pwxform(B, ctx, r): - * Compute B = BlockMix_pwxform{salsa20, ctx, r}(B). The input B must be - * 128r bytes in length. 
- */ -static void blockmix_pwxform_8way( uint32_t *B, pwxform_8way_ctx_t *ctx, - size_t r ) -{ - __m256i X[PWXwords]; - size_t r1, i; - - /* Convert 128-byte blocks to PWXbytes blocks */ - /* 1: r_1 <-- 128r / PWXbytes */ - r1 = 128 * r / PWXbytes; - - /* 2: X <-- B'_{r_1 - 1} */ - blkcpy_8way( X, &B[ (r1 - 1) * PWXwords ], PWXwords ); - - /* 3: for i = 0 to r_1 - 1 do */ - for ( i = 0; i < r1; i++ ) - { - /* 4: if r_1 > 1 */ - if ( r1 > 1 ) - { - /* 5: X <-- X xor B'_i */ - blkxor_8way( X, &B[ i * PWXwords ], PWXwords ); - } - - /* 7: X <-- pwxform(X) */ - pwxform_8way( X, ctx ); - - /* 8: B'_i <-- X */ - blkcpy_8way( &B[ i * PWXwords ], X, PWXwords ); - } - - /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ - i = ( r1 - 1 ) * PWXbytes / 64; - - /* 11: B_i <-- H(B_i) */ - salsa20_8way( &B[i * 16], ctx->salsa20_rounds ); - -#if 1 /* No-op with our current pwxform settings, but do it to make sure */ - /* 12: for i = i + 1 to 2r - 1 do */ - for ( i++; i < 2 * r; i++ ) - { - /* 13: B_i <-- H(B_i xor B_{i-1}) */ - blkxor_8way( &B[i * 16], &B[ (i - 1) * 16 ], 16 ); - salsa20_8way( &B[i * 16], ctx->salsa20_rounds ); - } -#endif -} - -// This looks a lot like data dependent addressing - -/** - * integerify(B, r): - * Return the result of parsing B_{2r-1} as a little-endian integer. - */ -static __m256i integerify8( const __m256i *B, size_t r ) -{ -/* - * Our 32-bit words are in host byte order. Also, they are SIMD-shuffled, but - * we only care about the least significant 32 bits anyway. - */ - const __m256i *X = &B[ (2 * r - 1) * 16 ]; - return X[0]; -} - -/** - * p2floor(x): - * Largest power of 2 not greater than argument. - */ -static uint32_t p2floor8( uint32_t x ) -{ - uint32_t y; - while ( ( y = x & (x - 1) ) ) - x = y; - return x; -} - -/** - * wrap(x, i): - * Wrap x to the range 0 to i-1. - */ -static uint32_t wrap8( uint32_t x, uint32_t i ) -{ - uint32_t n = p2floor( i ); - return ( x & (n - 1) ) + (i - n); -} - -/** - * smix1(B, r, N, V, X, ctx): - * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage X must be 128r bytes in length. - */ -static void smix1_8way( __m256i *B, size_t r, uint32_t N, - __m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx ) -{ - size_t s = 32 * r; - uint32_t i, j; - size_t k; - - /* 1: X <-- B */ - for ( k = 0; k < 2 * r; k++ ) - for ( i = 0; i < 16; i++ ) - X[ k * 16 + i ] = B[ k * 16 + ( i * 5 % 16 ) ]; - - if ( ctx->version != YESPOWER_0_5 ) - { - for ( k = 1; k < r; k++ ) - { - blkcpy_8way( &X[k * 32], &X[ (k - 1) * 32 ], 32 ); - blockmix_pwxform_8way( &X[k * 32], ctx, 1 ); - } - } - - /* 2: for i = 0 to N - 1 do */ - for ( i = 0; i < N; i++ ) - { - /* 3: V_i <-- X */ - blkcpy_8way( &V[i * s], X, s ); - - if ( i > 1 ) - { - -// is j int or vector? Integrify has data dependent addressing? - - /* j <-- Wrap(Integerify(X), i) */ -// j = wrap8( integerify8( X, r ), i ); - - /* X <-- X xor V_j */ - blkxor_8way( X, &V[j * s], s ); - } - - /* 4: X <-- H(X) */ - if ( V != ctx->S ) - blockmix_pwxform_8way( X, ctx, r ); - else - blockmix_salsa_8way( X, ctx->salsa20_rounds ); - } - - /* B' <-- X */ - for ( k = 0; k < 2 * r; k++ ) - for ( i = 0; i < 16; i++ ) - B[ k * 16 + ( i * 5 % 16 ) ] = X[ k * 16 + i ]; -} - -/** - * smix2(B, r, N, Nloop, V, X, ctx): - * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in - * length; the temporary storage V must be 128rN bytes in length; the temporary - * storage X must be 128r bytes in length. 
The value N must be a power of 2 - * greater than 1. - */ -static void smix2_8way( __m256i *B, size_t r, uint32_t N, uint32_t Nloop, - __m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx ) -{ - size_t s = 32 * r; - uint32_t i, j; - size_t k; - - /* X <-- B */ - for ( k = 0; k < 2 * r; k++ ) - for ( i = 0; i < 16; i++ ) - X[ k * 16 + i ] = B[ k * 16 + ( i * 5 % 16 ) ]; - - /* 6: for i = 0 to N - 1 do */ - for ( i = 0; i < Nloop; i++ ) - { - /* 7: j <-- Integerify(X) mod N */ -// j = integerify8(X, r) & (N - 1); - - /* 8.1: X <-- X xor V_j */ - blkxor_8way( X, &V[j * s], s ); - /* V_j <-- X */ - if ( Nloop != 2 ) - blkcpy_8way( &V[j * s], X, s ); - - /* 8.2: X <-- H(X) */ - blockmix_pwxform_8way( X, ctx, r ); - } - - /* 10: B' <-- X */ - for ( k = 0; k < 2 * r; k++ ) - for ( i = 0; i < 16; i++ ) - B[ k * 16 + ( i * 5 % 16 ) ] = X[ k * 16 + i ]; -} - -/** - * smix(B, r, N, p, t, V, X, ctx): - * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the - * temporary storage V must be 128rN bytes in length; the temporary storage - * X must be 128r bytes in length. The value N must be a power of 2 and at - * least 16. - */ -static void smix_8way( __m256i *B, size_t r, uint32_t N, - __m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx) -{ - uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ - uint32_t Nloop_rw = Nloop_all; - - Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ - - if ( ctx->version == YESPOWER_0_5 ) - Nloop_rw &= ~(uint32_t)1; /* round down to even */ - else - Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ - - smix1_8way( B, 1, ctx->Sbytes / 128, ctx->S, X, ctx ); - smix1_8way( B, r, N, V, X, ctx ); - smix2_8way( B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx ); - smix2_8way( B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx ); -} - -/** - * yespower(local, src, srclen, params, dst): - * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". - * - * Return 0 on success; or -1 on error. 
- */ -int yespower_8way( yespower_local_t *local, const __m256i *src, size_t srclen, - const yespower_params_t *params, yespower_8way_binary_t *dst, - int thrid ) -{ - yespower_version_t version = params->version; - uint32_t N = params->N; - uint32_t r = params->r; - const uint8_t *pers = params->pers; - size_t perslen = params->perslen; - int retval = -1; - size_t B_size, V_size; - uint32_t *B, *V, *X, *S; - pwxform_8way_ctx_t ctx; - __m256i sha256[8]; - - /* Sanity-check parameters */ - if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0 ) || - N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || - (N & (N - 1)) != 0 || r < rmin || - (!pers && perslen) ) - { - errno = EINVAL; - return -1; - } - - /* Allocate memory */ - B_size = (size_t)128 * r; - V_size = B_size * N; - if ((V = malloc(V_size)) == NULL) - return -1; - if ((B = malloc(B_size)) == NULL) - goto free_V; - if ((X = malloc(B_size)) == NULL) - goto free_B; - ctx.version = version; - if (version == YESPOWER_0_5) { - ctx.salsa20_rounds = 8; - ctx.PWXrounds = PWXrounds_0_5; - ctx.Swidth = Swidth_0_5; - ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth); - } else { - ctx.salsa20_rounds = 2; - ctx.PWXrounds = PWXrounds_1_0; - ctx.Swidth = Swidth_1_0; - ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth); - } - if ((S = malloc(ctx.Sbytes)) == NULL) - goto free_X; - ctx.S = S; - ctx.S0 = (__m256i (*)[2])S; - ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple; - ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple; - ctx.Smask = Swidth_to_Smask(ctx.Swidth); - ctx.w = 0; - - // do prehash - sha256_8way_full( sha256, src, srclen ); - - - // need flexible size, use malloc; - __m256i vpers[128]; - - if ( version != YESPOWER_0_5 && perslen ) - for ( int i = 0; i < perslen/4 + 1; i++ ) - vpers[i] = _mm256_set1_epi32( pers[i] ); - - /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ - pbkdf2_sha256_8way( B, B_size, sha256, sizeof(sha256), vpers, perslen, 1 ); - - blkcpy_8way( sha256, B, sizeof(sha256) / sizeof(sha256[0] ) ); - - /* 3: B_i <-- MF(B_i, N) */ - smix_8way( B, r, N, V, X, &ctx ); - - if ( version == YESPOWER_0_5 ) - { - /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ - pbkdf2_sha256_8way( dst, sizeof(*dst), sha256, sizeof(sha256), - B, B_size, 1 ); - - if ( pers ) - { - hmac_sha256_8way_full( dst, sizeof(*dst), vpers, perslen, sha256 ); - sha256_8way_full( dst, sha256, sizeof(sha256) ); - } - } - else - hmac_sha256_8way_full( dst, B + B_size - 64, 64, sha256, sizeof(sha256) ); - - /* Success! 
*/ - retval = 1; - - /* Free memory */ - free(S); -free_X: - free(X); -free_B: - free(B); -free_V: - free(V); - - return retval; -} - -int yespower_8way_tls( const __m256i *src, size_t srclen, - const yespower_params_t *params, yespower_8way_binary_t *dst, int trhid ) -{ -/* The reference implementation doesn't use thread-local storage */ - return yespower_8way( NULL, src, srclen, params, dst, trhid ); -} - -int yespower_init_local8( yespower_local_t *local ) -{ -/* The reference implementation doesn't use the local structure */ - local->base = local->aligned = NULL; - local->base_size = local->aligned_size = 0; - return 0; -} - -int yespower_free_local8( yespower_local_t *local ) -{ -/* The reference implementation frees its memory in yespower() */ - (void)local; /* unused */ - return 0; -} - -int yespower_8way_hash( const char *input, char *output, uint32_t len, - int thrid ) -{ - return yespower_8way_tls( input, len, &yespower_params, - (yespower_binary_t*)output, thrid ); -} - -int scanhash_yespower_8way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(128) hash[8*8]; - uint32_t _ALIGN(128) vdata[20*8]; - uint32_t _ALIGN(128) endiandata[20]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce; - uint32_t n = first_nonce; - const int thr_id = mythr->id; - - for ( int k = 0; k < 19; k++ ) - be32enc( &endiandata[k], pdata[k] ); - endiandata[19] = n; - -// do sha256 prehash - SHA256_Init( &sha256_prehash_ctx ); - SHA256_Update( &sha256_prehash_ctx, endiandata, 64 ); - - do { - if ( yespower_hash( vdata, hash, 80, thr_id ) ) - if unlikely( valid_hash( hash, ptarget ) && !opt_benchmark ) - { - be32enc( pdata+19, n ); - submit_solution( work, hash, mythr ); - } - endiandata[19] = ++n; - } while ( n < last_nonce && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce; - pdata[19] = n; - return 0; -} - -#endif // AVX2 diff --git a/armbuild-all.sh b/armbuild-all.sh index 2655dba..e77cf79 100755 --- a/armbuild-all.sh +++ b/armbuild-all.sh @@ -1,7 +1,7 @@ #!/bin/bash # # This script is not intended for users, it is only used for compile testing -# during develpment. However the information contained may provide compilation +# during development. However, the information contained may provide compilation # tips to users. 
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null diff --git a/build-allarch.sh b/build-allarch.sh index 9911d4f..dba2afc 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -10,9 +10,9 @@ rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=icelake-client -Wall" ./configure --with-curl +#CFLAGS="-O3 -march=icelake-client -Wall" ./configure --with-curl # Rocketlake needs gcc-11 -#CFLAGS="-O3 -march=rocketlake -Wall" ./configure --with-curl +CFLAGS="-O3 -march=rocketlake -Wall" ./configure --with-curl make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes @@ -21,7 +21,7 @@ mv cpuminer cpuminer-avx512-sha-vaes #make clean || echo clean #rm -f config.status #CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl -#make -j 8 +#make -j $(nproc) #strip -s cpuminer #mv cpuminer cpuminer-alderlake @@ -30,35 +30,43 @@ mv cpuminer cpuminer-avx512-sha-vaes #make clean || echo clean #rm -f config.status #CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl -#make -j 8 +#make -j $(nproc) #strip -s cpuminer #mv cpuminer cpuminer-arrowlake-s # Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14 -# Apparently Granitrapids will not include AVX10, SHA512 or APX, +# Graniterapids does not build with AVX10, SHA512 or APX. # wait for Diamondrapids & gcc-15. #make clean || echo clean #rm -f config.status #CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl -#make -j 8 +#make -j $(nproc) #strip -s cpuminer #mv cpuminer cpuminer-graniterapids -# Force AVX10-256 +# SHA512 AVX10.1 #make clean || echo clean #rm -f config.status -#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl -#make -j 8 +#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl +#make -j $(nproc) #strip -s cpuminer -#mv cpuminer cpuminer-avx10-256 +#mv cpuminer cpuminer-avx10_1 -# Force SHA512 AVX10-512 +# SHA512 AVX10.2 #make clean || echo clean #rm -f config.status -#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl -#make -j 8 +#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.2 -Wall" ./configure --with-curl +#make -j $(nproc) #strip -s cpuminer -#mv cpuminer cpuminer-avx10-512 +#mv cpuminer cpuminer-avx10_2 + +# Diamondrapids: AVX10.2, SHA512, APX; needs GCC-15 & CPU with APX to compile. +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=diamondrapids -Wall" ./configure --with-curl +#make -j $(nproc) +#strip -s cpuminer +#mv cpuminer cpuminer-diamondrapids # Zen5: AVX512 SHA VAES, requires gcc-14. #make clean || echo clean @@ -71,11 +79,10 @@ mv cpuminer cpuminer-avx512-sha-vaes # Zen4: AVX512 SHA VAES make clean || echo clean rm -f config.status -# znver3 needs gcc-11, znver4 needs gcc-12.3. +# Zen4: AVX512, SHA, VAES, needs gcc-12.3. #CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl # Incomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer.
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl -#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall" ./configure --with-curl make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-zen4 @@ -83,7 +90,6 @@ mv cpuminer cpuminer-zen4 # Zen3 AVX2 SHA VAES make clean || echo clean rm -f config.status -#CFLAGS="-O3 -march=znver2 -mvaes" ./configure --with-curl CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl make -j $(nproc) strip -s cpuminer @@ -159,7 +165,7 @@ mv cpuminer cpuminer-ssse3 # SSE2 make clean || echo clean rm -f config.status -CFLAGS="-O3 -msse2 -Wall" ./configure --with-curl +CFLAGS="-O3 -march=x86-64 -msse2 -Wall" ./configure --with-curl make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-sse2 diff --git a/configure b/configure index fc3d3cd..ba07e11 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.3. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.4. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='25.3' -PACKAGE_STRING='cpuminer-opt 25.3' +PACKAGE_VERSION='25.4' +PACKAGE_STRING='cpuminer-opt 25.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 25.3 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1431,7 +1431,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 25.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 25.4:";; esac cat <<\_ACEOF @@ -1536,7 +1536,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 25.3 +cpuminer-opt configure 25.4 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 25.3, which was +It was created by cpuminer-opt $as_me 25.4, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3591,7 +3591,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='25.3' + VERSION='25.4' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 25.3, which was +This file was extended by cpuminer-opt $as_me 25.4, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 25.3 +cpuminer-opt config.status 25.4 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 859da80..bcd8c00 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [25.3]) +AC_INIT([cpuminer-opt], [25.4]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index 01c5176..47fb362 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.3. +# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.4. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation, @@ -601,8 +601,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='25.3' -PACKAGE_STRING='cpuminer-opt 25.3' +PACKAGE_VERSION='25.4' +PACKAGE_STRING='cpuminer-opt 25.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -'configure' configures cpuminer-opt 25.3 to adapt to many kinds of systems. +'configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1424,7 +1424,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 25.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 25.4:";; esac cat <<\_ACEOF @@ -1528,7 +1528,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 25.3 +cpuminer-opt configure 25.4 generated by GNU Autoconf 2.72 Copyright (C) 2023 Free Software Foundation, Inc. @@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 25.3, which was +It was created by cpuminer-opt $as_me 25.4, which was generated by GNU Autoconf 2.72. Invocation command line was $ $0$ac_configure_args_raw @@ -3764,7 +3764,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='25.3' + VERSION='25.4' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7450,7 +7450,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 25.3, which was +This file was extended by cpuminer-opt $as_me 25.4, which was generated by GNU Autoconf 2.72. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7518,7 +7518,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 25.3 +cpuminer-opt config.status 25.4 configured by $0, generated by GNU Autoconf 2.72, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index ff6a590..0937062 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -152,7 +152,7 @@ int stratum_thr_id = -1; int api_thr_id = -1; bool stratum_need_reset = false; struct work_restart *work_restart = NULL; -struct stratum_ctx stratum; +struct stratum_ctx stratum = {0}; double opt_diff_factor = 1.0; double opt_target_factor = 1.0; uint32_t zr5_pok = 0; @@ -187,7 +187,7 @@ static bool opt_api_enabled = false; char *opt_api_allow = NULL; int opt_api_listen = 0; int opt_api_remote = 0; -char *default_api_allow = "127.0.0.1"; +const char *default_api_allow = "127.0.0.1"; int default_api_listen = 4048; pthread_mutex_t applog_lock; @@ -870,9 +870,9 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) work->tx_count = tx_count; /* assemble block header */ - algo_gate.build_block_header( work, swab32( version ), + algo_gate.build_block_header( work, bswap_32( version ), (uint32_t*) prevhash, (uint32_t*) merkle_tree, - swab32( curtime ), le32dec( &bits ), + bswap_32( curtime ), le32dec( &bits ), final_sapling_hash ); if ( unlikely( !jobj_binary( val, "target", target, sizeof(target) ) ) ) @@ -1773,7 +1773,7 @@ static bool get_work(struct thr_info *thr, struct work *work) // why 74? std cmp_size is 76, std data is 128 for ( int n = 0; n < 74; n++ ) ( (char*)work->data )[n] = n; - work->data[algo_gate.ntime_index] = swab32(ts); // ntime + work->data[algo_gate.ntime_index] = bswap_32(ts); // ntime // this overwrites much of the for loop init memset( work->data + algo_gate.nonce_index, 0x00, 52); // nonce..nonce+52 @@ -2009,36 +2009,37 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) { bool new_job; - pthread_rwlock_wrlock( &g_work_lock ); pthread_mutex_lock( &sctx->work_lock ); new_job = sctx->new_job; // otherwise just increment extranonce2 sctx->new_job = false; + + pthread_rwlock_wrlock( &g_work_lock ); free( g_work->job_id ); g_work->job_id = strdup( sctx->job.job_id ); g_work->xnonce2_len = sctx->xnonce2_size; g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size ); + g_work->height = sctx->block_height; + g_work->targetdiff = sctx->job.diff + / ( opt_target_factor * opt_diff_factor ); memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size ); algo_gate.build_extraheader( g_work, sctx ); net_diff = nbits_to_diff( g_work->data[ algo_gate.nbits_index ] ); algo_gate.set_work_data_endian( g_work ); - g_work->height = sctx->block_height; - g_work->targetdiff = sctx->job.diff - / ( opt_target_factor * opt_diff_factor ); diff_to_hash( g_work->target, g_work->targetdiff ); + g_work_time = time(NULL); + restart_threads(); + pthread_rwlock_unlock( &g_work_lock ); + // Pre increment extranonce2 in case of being called again before receiving // a new job for ( int t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - g_work_time = time(NULL); - restart_threads(); - pthread_mutex_unlock( &sctx->work_lock ); - pthread_rwlock_unlock( &g_work_lock ); pthread_mutex_lock( &stats_lock ); @@ -2072,7 +2073,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work 
*g_work ) // Update data and calculate new estimates. if ( ( stratum_diff != sctx->job.diff ) - || ( last_block_height != sctx->block_height ) ) + || ( last_block_height != sctx->block_height ) ) { if ( unlikely( !session_first_block ) ) session_first_block = stratum.block_height; @@ -2189,7 +2190,7 @@ static void *miner_thread( void *userdata ) } // wait for stratum to send first job - if ( have_stratum ) while ( unlikely( stratum_down ) ) + if ( have_stratum ) while ( unlikely( !stratum.job.job_id ) ) { if ( opt_debug ) applog( LOG_INFO, "Thread %d waiting for first job", thr_id ); @@ -2203,7 +2204,6 @@ static void *miner_thread( void *userdata ) { uint64_t hashes_done; struct timeval tv_start, tv_end, diff; -// int64_t max64 = 1000; int nonce_found = 0; if ( have_stratum ) @@ -2229,13 +2229,6 @@ static void *miner_thread( void *userdata ) } else if ( !opt_benchmark ) // GBT or getwork { - // max64 is used to set end_nonce to match the scantime. - // It also factors the nonce range to end the scan when nonces are - // exhausted. In either case needing new work can be assumed. - // Only problem is every thread will call get_work. - // First thread resets scantime blocking all subsequent threads - // from fetching new work. - pthread_rwlock_wrlock( &g_work_lock ); const time_t now = time(NULL); if ( ( ( now - g_work_time ) >= opt_scantime ) @@ -2872,8 +2865,7 @@ static bool cpu_capability( bool display_only ) bool sw_has_avx = false; bool sw_has_avx2 = false; bool sw_has_avx512 = false; - bool sw_has_avx10_256 = false; - bool sw_has_avx10_512 = false; + bool sw_has_avx10 = false; bool sw_has_aes = false; bool sw_has_vaes = false; bool sw_has_sha256 = false; // x86_64 or AArch64 @@ -2936,11 +2928,13 @@ static bool cpu_capability( bool display_only ) #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) sw_has_avx512 = true; #endif - #if defined(__AVX10_1_256__) - sw_has_avx10_256 = true; - #endif - #if defined(__AVX10_1_512__) - sw_has_avx10_512 = true; +// AVX10 version is not significant as of AVX10.2. If that changes use a better +// way to test the version than sequentially. 
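// Editor's note: a sketch of one way to centralize the compile-time version
// test hinted at above, should the AVX10 minor version ever matter. It keeps
// the top-down macro test in a single place and exposes one number that later
// code can compare. __AVX10_1__ (GCC-14) and __AVX10_2__ (GCC-15) are the
// compiler macros already referenced in this patch; SW_AVX10_VERSION is a
// hypothetical name, not part of the patch.
#if defined(__AVX10_2__)
   #define SW_AVX10_VERSION 2
#elif defined(__AVX10_1__)
   #define SW_AVX10_VERSION 1
#else
   #define SW_AVX10_VERSION 0
#endif
// With this, sw_has_avx10 could simply be ( SW_AVX10_VERSION >= 1 ).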
+// #if defined(__AVX10_2__) +// +// #elif defined(__AVX10_1__) + #if defined(__AVX10_1__) + sw_has_avx10 = true; #endif // x86_64 or AArch64 @@ -3008,8 +3002,7 @@ static bool cpu_capability( bool display_only ) printf("CPU features: "); if ( cpu_arch_x86_64() ) { - if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(), - avx10_vector_length() ); + if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() ); if ( cpu_has_avx512 ) printf( " AVX512" ); else if ( cpu_has_avx2 ) printf( " AVX2 " ); else if ( cpu_has_avx ) printf( " AVX " ); @@ -3034,8 +3027,7 @@ static bool cpu_capability( bool display_only ) printf("\nSW features: "); if ( sw_has_x86_64 ) { - if ( sw_has_avx10_512 ) printf( " AVX10-512" ); - else if ( sw_has_avx10_256 ) printf( " AVX10-256" ); + if ( sw_has_avx10 ) printf( " AVX10 " ); else if ( sw_has_avx512 ) printf( " AVX512" ); else if ( sw_has_avx2 ) printf( " AVX2 " ); else if ( sw_has_avx ) printf( " AVX " ); @@ -3060,122 +3052,9 @@ static bool cpu_capability( bool display_only ) printf("\n"); -/* - if ( !display_only ) - { - printf("\nAlgo features:"); - if ( algo_features == EMPTY_SET ) printf( " None" ); - else - { - if ( algo_has_avx512 ) printf( " AVX512" ); - else if ( algo_has_avx2 ) printf( " AVX2 " ); - else if ( algo_has_sse42 ) printf( " SSE4.2" ); - else if ( algo_has_sse2 ) printf( " SSE2 " ); - if ( algo_has_neon ) printf( " NEON" ); - if ( algo_has_vaes ) printf( " VAES" ); - else if ( algo_has_aes ) printf( " AES" ); - if ( algo_has_sha512 ) printf( " SHA512" ); - else if ( algo_has_sha256 ) printf( " SHA256" ); - } - } - printf("\n"); - - - if ( display_only ) return true; - - // Determine mining options - use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2; - use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; - use_avx = cpu_has_avx && sw_has_avx && algo_has_avx; - use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; - use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; - use_aes = cpu_has_aes && sw_has_aes && algo_has_aes; - use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; - use_sha256 = cpu_has_sha256 && sw_has_sha256 && algo_has_sha256; - use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512; - use_neon = sw_has_aarch64 && sw_has_neon && algo_has_neon; - use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512 - || use_avx2 || use_sha256 || use_vaes || use_sha512 || use_neon ); - - // Display best options - if ( !use_none ) - { - applog_nl( "Enabled optimizations:" ); - if ( use_neon ) printf( " NEON" ); - if ( use_avx512 ) printf( " AVX512" ); - else if ( use_avx2 ) printf( " AVX2" ); - else if ( use_avx ) printf( " AVX" ); - else if ( use_sse42 ) printf( " SSE42" ); - else if ( use_sse2 ) printf( " SSE2" ); - if ( use_vaes ) printf( " VAES" ); - else if ( use_aes ) printf( " AES" ); - if ( use_sha512 ) printf( " SHA512" ); - else if ( use_sha256 ) printf( " SHA256" ); - printf( "\n" ); - } -*/ - return true; } -/* -void show_version_and_exit(void) -{ - printf("\n built on " __DATE__ -#ifdef _MSC_VER - " with VC++ 2013\n"); -#elif defined(__GNUC__) - " with GCC"); - printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); -#endif - printf(" features:" -#if defined(USE_ASM) && defined(__i386__) - " i386" -#endif -#if defined(USE_ASM) && defined(__x86_64__) - " x86_64" -#endif -#if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__)) - " SSE2" -#endif -#if defined(__x86_64__) && defined(USE_AVX) - " AVX" -#endif -#if defined(__x86_64__) && defined(USE_AVX2) 
- " AVX2" -#endif -#if defined(__x86_64__) && defined(USE_XOP) - " XOP" -#endif -#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) - " ARM" -#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ - defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) - " ARMv5E" -#endif -#if defined(__ARM_NEON__) - " NEON" -#endif -#endif - "\n\n"); - - printf("%s\n", curl_version()); -#ifdef JANSSON_VERSION - printf("jansson/%s ", JANSSON_VERSION); -#endif -#ifdef PTW32_VERSION - printf("pthreads/%d.%d.%d.%d ", PTW32_VERSION); -#endif - printf("\n"); - exit(0); -} -*/ void show_usage_and_exit(int status) { if (status) @@ -3253,7 +3132,7 @@ void parse_arg(int key, char *arg ) else if ( arg ) { /* port or 0 to disable */ - opt_api_allow = default_api_allow; + opt_api_allow = (char*)default_api_allow; opt_api_listen = atoi(arg); } break; @@ -3289,7 +3168,7 @@ void parse_arg(int key, char *arg ) // debug overrides quiet case 'q': // quiet - if ( !( opt_debug || opt_protocol ) ) opt_quiet = true; + opt_quiet = !( opt_debug || opt_protocol ); break; case 'D': // debug opt_debug = true; diff --git a/miner.h b/miner.h index 9aa13f2..7bd508f 100644 --- a/miner.h +++ b/miner.h @@ -8,8 +8,8 @@ #define USER_AGENT_ARCH "x64" // Intel, AMD x86_64 #elif defined(__aarch64__) #define USER_AGENT_ARCH "arm" // AArch64 -//#elif -// #define USER_AGENT_ARCH "r5" // RISC-V +#elif defined(__riscv) + #define USER_AGENT_ARCH "rv" // RISC-V #else #define USER_AGENT_ARCH #endif @@ -65,7 +65,7 @@ # endif #endif -// no mm_maloc for Neon +// no mm_malloc for Neon #if !defined(__ARM_NEON) #include @@ -173,6 +173,7 @@ static inline bool is_windows(void) #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif +#if 0 // deprecated, see simd-int.h #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) #define WANT_BUILTIN_BSWAP @@ -183,6 +184,7 @@ static inline bool is_windows(void) */ #endif +/* static inline uint32_t swab32(uint32_t x) { #ifdef WANT_BUILTIN_BSWAP @@ -195,6 +197,8 @@ static inline uint32_t swab32(uint32_t x) // return bswap_32(v); #endif } +*/ +#endif // Swap any two variables of the same type without using a temp #define swap_vars(a,b) a^=b; b^=a; a^=b; diff --git a/simd-utils.h b/simd-utils.h index f199e56..45f50cf 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -140,28 +140,28 @@ #include #include -// AVX512 macros are not a reliable indicator of 512 bit vector capability -// because they get defined with AVX10_1_256 which doesn't support 512 bit. -// EVEX512 is also unreliable as it can also be defined when 512b is not -// available. -// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present. -// Use AVX512 macros only whithout AVX10. - /* -// Test for macros -#ifdef __AVX10_1__ +// Test for AVX10 macros +// AVX10-256 was abandoned by Intel before any CPUs were built. 
+#ifdef __AVX10__ // does not exist +#warning "__AVX10__" +#endif +#ifdef __AVX10_1__ // GCC-14 #warning "__AVX10_1__" #endif -#ifdef __AVX10_1_256__ +#ifdef __AVX10_2__ // GCC-15 +#warning "__AVX10_2__" +#endif +#ifdef __AVX10_1_256__ // obsolete #warning "__AVX10_1_256__" #endif -#ifdef __AVX10_1_512__ -#warning "__AVX10_1_512__" +#ifdef __AVX10_1_512__ +#warning "__AVX10_1_512__" // does not exist #endif -#ifdef __EVEX256__ -#warning "__EVEX256__" +#ifdef __EVEX256__ // likely obsolete +#warning "__EVEX256__" #endif -#ifdef __EVEX512__ +#ifdef __EVEX512__ // likely obsolete #warning "__EVEX512__" #endif #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -169,27 +169,14 @@ #endif */ -// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and -// must be tested seperately. -// VL256: Include AVX512VL instructions for 256 & 128 bit vectors. -// VBMI: Include AVX512VBMI instructions for supported vector lengths. - -#if defined(__AVX10_1__) - - #define VL256 1 - #define VBMI 1 - #if defined(__AVX10_1_512__) - #define SIMD512 1 - #endif - -#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - - #define VL256 1 +// With Intel abandoning AVX10-256 the SIMD512 & VL256 macros are almost +// identical with the only difference being VBMI is included in VL256. +#if defined(__AVX10_1__) || ( defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) ) #define SIMD512 1 - #if defined(__AVX512VBMI__) + #define VL256 1 + #if defined(__AVX10_1__) || defined(__AVX512VBMI__) #define VBMI 1 #endif - #endif /* @@ -204,13 +191,75 @@ #endif */ +// targeted intrinsics #if defined(__x86_64__) + #include +#elif defined(__aarch64__) && defined(__ARM_NEON) + #include +#elif defined(__riscv) && defined(__riscv_vector) + #include +#endif -#include +// Single global definition for frequently used vector constants. +// The GCC optimizer can merge constants but merging different vector lengths +// might be beyond its scope. -#elif defined(__aarch64__) -#include +// Frequently used SSE/AVX shuffle constants. +#if defined(SIMD512) -#include +// When used with shuffle_epi8 performs a standard bswap of all elements. +// When used with permutexvar_epi8 (requires AVX512VBMI or AVX10) performs a +// bswap of the elements in the lower 128 bits of the source and broadcasts +// the result to all 128 bit lanes of the destination. + +extern const __m512i V512_BSWAP64; +#define V256_BSWAP64 _mm512_castsi512_si256( V512_BSWAP64 ) +#define V128_BSWAP64 _mm512_castsi512_si128( V512_BSWAP64 ) + +extern const __m512i V512_BSWAP32; +#define V256_BSWAP32 _mm512_castsi512_si256( V512_BSWAP32 ) +#define V128_BSWAP32 _mm512_castsi512_si128( V512_BSWAP32 ) + +#elif defined(__AVX2__) +extern const __m256i V256_BSWAP64; +#define V128_BSWAP64 _mm256_castsi256_si128( V256_BSWAP64 ) + +extern const __m256i V256_BSWAP32; +#define V128_BSWAP32 _mm256_castsi256_si128( V256_BSWAP32 ) + +// These shuffles aren't needed with AVX512, which uses ror/rol instead.
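// Editor's note: a self-contained sketch of what these bswap constants are
// presumed to encode per 128-bit lane and how they are used, based on the
// shuffle masks removed elsewhere in this patch. The function names are the
// editor's and illustrative only; the real constants are defined once in a
// separate source file as described above.
#include <immintrin.h>

#if defined(__SSSE3__)
// Byte-swap each 32-bit element of a 128-bit vector with a byte shuffle.
static inline __m128i v128_bswap32_sketch( __m128i x )
{
   const __m128i bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL,
                                           0x0405060700010203ULL );
   return _mm_shuffle_epi8( x, bswap32 );   // same effect as using V128_BSWAP32
}

// Byte-swap each 64-bit element; only the index pattern differs.
static inline __m128i v128_bswap64_sketch( __m128i x )
{
   const __m128i bswap64 = _mm_set_epi64x( 0x08090a0b0c0d0e0fULL,
                                           0x0001020304050607ULL );
   return _mm_shuffle_epi8( x, bswap64 );   // same effect as using V128_BSWAP64
}
#endif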
+ +extern const __m256i V256_SHUFLR64_8; +#define V128_SHUFLR64_8 _mm256_castsi256_si128( V256_SHUFLR64_8 ) + +extern const __m256i V256_SHUFLR64_24; +#define V128_SHUFLR64_24 _mm256_castsi256_si128( V256_SHUFLR64_24 ) + +extern const __m256i V256_SHUFLL64_8; +#define V128_SHUFLL64_8 _mm256_castsi256_si128( V256_SHUFLL64_8 ) + +extern const __m256i V256_SHUFLL64_24; +#define V128_SHUFLL64_24 _mm256_castsi256_si128( V256_SHUFLL64_24 ) + +extern const __m256i V256_SHUFLR32_8; +#define V128_SHUFLR32_8 _mm256_castsi256_si128( V256_SHUFLR32_8 ) + +extern const __m256i V256_SHUFLL32_8; +#define V128_SHUFLL32_8 _mm256_castsi256_si128( V256_SHUFLL32_8 ) + +#elif defined(__SSSE3__) + +extern const __m128i V128_BSWAP64; +extern const __m128i V128_BSWAP32; + +extern const __m128i V128_SHUFLR64_8; +extern const __m128i V128_SHUFLR64_24; +extern const __m128i V128_SHUFLL64_8; +extern const __m128i V128_SHUFLL64_24; + +extern const __m128i V128_SHUFLR32_8; +extern const __m128i V128_SHUFLL32_8; #endif @@ -225,7 +274,7 @@ // x86_64 AVX512 512 bit vectors #include "simd-utils/simd-512.h" -// aarch64 neon 128 bit vectors +// aarch64 NEON 128 bit vectors #include "simd-utils/simd-neon.h" #include "simd-utils/intrlv.h" diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 5ea5507..bba9edf 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -589,20 +589,7 @@ static inline void extr_lane_4x32( void *d, const void *s, ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+60 ]; } -#if defined(__SSSE3__) - -static inline void v128_bswap32_80( void *d, void *s ) -{ - const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); - casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), bswap_shuf ); - casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), bswap_shuf ); - casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), bswap_shuf ); - casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), bswap_shuf ); - casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), bswap_shuf ); -} - -#elif defined(__aarch64__) && defined(__ARM_NEON) +#if defined(__SSSE3__) || defined(__ARM_NEON) static inline void v128_bswap32_80( void *d, void *s ) { @@ -641,6 +628,8 @@ static inline void v128_bswap32_80( void *d, void *s ) #endif +#if defined(__SSE2__) || defined(__ARM_NEON) + static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) { v128u32_t s0 = casti_v128u32( src,0 ); @@ -649,27 +638,12 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) v128u32_t s3 = casti_v128u32( src,3 ); v128u32_t s4 = casti_v128u32( src,4 ); -#if defined(__SSSE3__) - - const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); - - s0 = _mm_shuffle_epi8( s0, bswap_shuf ); - s1 = _mm_shuffle_epi8( s1, bswap_shuf ); - s2 = _mm_shuffle_epi8( s2, bswap_shuf ); - s3 = _mm_shuffle_epi8( s3, bswap_shuf ); - s4 = _mm_shuffle_epi8( s4, bswap_shuf ); - -#else - s0 = v128_bswap32( s0 ); s1 = v128_bswap32( s1 ); s2 = v128_bswap32( s2 ); s3 = v128_bswap32( s3 ); s4 = v128_bswap32( s4 ); -#endif - casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 ); casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 ); casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 ); @@ -696,6 +670,8 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) casti_v128u32( d,19 ) = v128_duplane32( s2, 3 ); } +#endif // SSE2 || NEON + // 8x32 #if defined(__AVX2__) @@ -1112,8 +1088,6 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) static 
inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) { - const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); const __m256i c1 = v256_32( 1 ); const __m256i c2 = _mm256_add_epi32( c1, c1 ); const __m256i c3 = _mm256_add_epi32( c2, c1 ); @@ -1124,11 +1098,11 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src ) v128_t s3 = casti_v128( src,3 ); v128_t s4 = casti_v128( src,4 ); - s0 = _mm_shuffle_epi8( s0, bswap_shuf ); - s1 = _mm_shuffle_epi8( s1, bswap_shuf ); - s2 = _mm_shuffle_epi8( s2, bswap_shuf ); - s3 = _mm_shuffle_epi8( s3, bswap_shuf ); - s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + s0 = v128_bswap32( s0 ); + s1 = v128_bswap32( s1 ); + s2 = v128_bswap32( s2 ); + s3 = v128_bswap32( s3 ); + s4 = v128_bswap32( s4 ); casti_m256i( d, 0 ) = _mm256_broadcastd_epi32( s0 ); casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( @@ -1617,8 +1591,6 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) { - const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); const __m512i c1 = v512_32( 1 ); const __m512i c2 = _mm512_add_epi32( c1, c1 ); const __m512i c3 = _mm512_add_epi32( c2, c1 ); @@ -1628,11 +1600,11 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src ) v128_t s3 = casti_v128( src,3 ); v128_t s4 = casti_v128( src,4 ); - s0 = _mm_shuffle_epi8( s0, bswap_shuf ); - s1 = _mm_shuffle_epi8( s1, bswap_shuf ); - s2 = _mm_shuffle_epi8( s2, bswap_shuf ); - s3 = _mm_shuffle_epi8( s3, bswap_shuf ); - s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + s0 = v128_bswap32( s0 ); + s1 = v128_bswap32( s1 ); + s2 = v128_bswap32( s2 ); + s3 = v128_bswap32( s3 ); + s4 = v128_bswap32( s4 ); casti_m512i( d, 0 ) = _mm512_broadcastd_epi32( s0 ); casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( c1, @@ -1878,6 +1850,8 @@ static inline void dintrlv_2x64( void *dst0, void *dst1, #endif +#if defined(__SSE2__) || defined(__ARM_NEON) + static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src ) { v128u64_t s0 = casti_v128u64( src,0 ); @@ -1886,27 +1860,12 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src ) v128u64_t s3 = casti_v128u64( src,3 ); v128u64_t s4 = casti_v128u64( src,4 ); -#if defined(__SSSE3__) - - const v128u64_t bswap_shuf = v128_set64( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); - - s0 = _mm_shuffle_epi8( s0, bswap_shuf ); - s1 = _mm_shuffle_epi8( s1, bswap_shuf ); - s2 = _mm_shuffle_epi8( s2, bswap_shuf ); - s3 = _mm_shuffle_epi8( s3, bswap_shuf ); - s4 = _mm_shuffle_epi8( s4, bswap_shuf ); - -#else - s0 = v128_bswap32( s0 ); s1 = v128_bswap32( s1 ); s2 = v128_bswap32( s2 ); s3 = v128_bswap32( s3 ); s4 = v128_bswap32( s4 ); -#endif - casti_v128u64( d,0 ) = v128_duplane64( s0, 0 ); casti_v128u64( d,1 ) = v128_duplane64( s0, 1 ); @@ -1923,6 +1882,8 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src ) casti_v128u64( d,9 ) = v128_duplane64( s4, 1 ); } +#endif // SSE2 || NEON + static inline void extr_lane_2x64( void *dst, const void *src, const int lane, const int bit_len ) { @@ -2233,25 +2194,23 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src ) { - const __m256i bswap_shuf = mm256_bcast_m128( - _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); - __m256i s0 = casti_m256i( src,0 ); - __m256i s1 = casti_m256i( src,1 ); + __m256i 
s0 = casti_m256i( src,0 ); // s0, s1 + __m256i s2 = casti_m256i( src,1 ); // s2, s3 v128_t s4 = casti_v128( src,4 ); - s0 = _mm256_shuffle_epi8( s0, bswap_shuf ); - s1 = _mm256_shuffle_epi8( s1, bswap_shuf ); - s4 = _mm_shuffle_epi8( s4, _mm256_castsi256_si128( bswap_shuf ) ); + s0 = mm256_bswap_32( s0 ); + s2 = mm256_bswap_32( s2 ); + s4 = v128_bswap32( s4 ); casti_m256i( d, 0 ) = _mm256_permute4x64_epi64( s0, 0x00 ); casti_m256i( d, 1 ) = _mm256_permute4x64_epi64( s0, 0x55 ); casti_m256i( d, 2 ) = _mm256_permute4x64_epi64( s0, 0xaa ); casti_m256i( d, 3 ) = _mm256_permute4x64_epi64( s0, 0xff ); - casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s1, 0x00 ); - casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s1, 0x55 ); - casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s1, 0xaa ); - casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s1, 0xff ); + casti_m256i( d, 4 ) = _mm256_permute4x64_epi64( s2, 0x00 ); + casti_m256i( d, 5 ) = _mm256_permute4x64_epi64( s2, 0x55 ); + casti_m256i( d, 6 ) = _mm256_permute4x64_epi64( s2, 0xaa ); + casti_m256i( d, 7 ) = _mm256_permute4x64_epi64( s2, 0xff ); casti_m256i( d, 8 ) = _mm256_permute4x64_epi64( _mm256_castsi128_si256( s4 ), 0x00 ); @@ -2648,8 +2607,6 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) { - const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); const __m512i c1 = v512_64( 1 ); v128_t s0 = casti_v128( src,0 ); v128_t s1 = casti_v128( src,1 ); @@ -2657,11 +2614,11 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) v128_t s3 = casti_v128( src,3 ); v128_t s4 = casti_v128( src,4 ); - s0 = _mm_shuffle_epi8( s0, bswap_shuf ); - s1 = _mm_shuffle_epi8( s1, bswap_shuf ); - s2 = _mm_shuffle_epi8( s2, bswap_shuf ); - s3 = _mm_shuffle_epi8( s3, bswap_shuf ); - s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + s0 = v128_bswap32( s0 ); + s1 = v128_bswap32( s1 ); + s2 = v128_bswap32( s2 ); + s3 = v128_bswap32( s3 ); + s4 = v128_bswap32( s4 ); casti_m512i( d,0 ) = _mm512_broadcastq_epi64( s0 ); casti_m512i( d,1 ) = _mm512_permutexvar_epi64( c1, @@ -2842,49 +2799,45 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2, static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src ) { - const __m512i bswap_shuf = mm512_bcast_m128( - _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); const v128_t s0 = casti_v128( src,0 ); const v128_t s1 = casti_v128( src,1 ); const v128_t s2 = casti_v128( src,2 ); const v128_t s3 = casti_v128( src,3 ); const v128_t s4 = casti_v128( src,4 ); - casti_m512i( d,0 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s0 ), - bswap_shuf ); - casti_m512i( d,1 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s1 ), - bswap_shuf ); - casti_m512i( d,2 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s2 ), - bswap_shuf ); - casti_m512i( d,3 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s3 ), - bswap_shuf ); - casti_m512i( d,4 ) = _mm512_permutexvar_epi8( _mm512_castsi128_si512( s4 ), - bswap_shuf ); + casti_m512i( d,0 ) = _mm512_permutexvar_epi8( V512_BSWAP32, + _mm512_castsi128_si512( s0 ) ); + casti_m512i( d,1 ) = _mm512_permutexvar_epi8( V512_BSWAP32, + _mm512_castsi128_si512( s1 ) ); + casti_m512i( d,2 ) = _mm512_permutexvar_epi8( V512_BSWAP32, + _mm512_castsi128_si512( s2 ) ); + casti_m512i( d,3 ) = _mm512_permutexvar_epi8( V512_BSWAP32, + _mm512_castsi128_si512( s3 ) ); + casti_m512i( d,4 ) = _mm512_permutexvar_epi8( 
V512_BSWAP32, + _mm512_castsi128_si512( s4 ) ); } #else static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src ) { - const v128_t bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, - 0x0405060700010203 ); v128_t s0 = casti_v128( src,0 ); v128_t s1 = casti_v128( src,1 ); v128_t s2 = casti_v128( src,2 ); v128_t s3 = casti_v128( src,3 ); v128_t s4 = casti_v128( src,4 ); - s0 = _mm_shuffle_epi8( s0, bswap_shuf ); - s1 = _mm_shuffle_epi8( s1, bswap_shuf ); - s2 = _mm_shuffle_epi8( s2, bswap_shuf ); - s3 = _mm_shuffle_epi8( s3, bswap_shuf ); - s4 = _mm_shuffle_epi8( s4, bswap_shuf ); + s0 = v128_bswap32( s0 ); + s1 = v128_bswap32( s1 ); + s2 = v128_bswap32( s2 ); + s3 = v128_bswap32( s3 ); + s4 = v128_bswap32( s4 ); - casti_m512i( d,0 ) = mm512_bcast_m128( s0 ); - casti_m512i( d,1 ) = mm512_bcast_m128( s1 ); - casti_m512i( d,2 ) = mm512_bcast_m128( s2 ); - casti_m512i( d,3 ) = mm512_bcast_m128( s3 ); - casti_m512i( d,4 ) = mm512_bcast_m128( s4 ); + casti_m512i( d,0 ) = mm512_bcast128( s0 ); + casti_m512i( d,1 ) = mm512_bcast128( s1 ); + casti_m512i( d,2 ) = mm512_bcast128( s2 ); + casti_m512i( d,3 ) = mm512_bcast128( s3 ); + casti_m512i( d,4 ) = mm512_bcast128( s4 ); } #endif // AVX512VBMI ELSE diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 0774838..8a5f908 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -521,29 +521,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #elif defined(__SSSE3__) // SSSE3: fastest 32 bit, very fast 16, fast 8 -#define v128_shuflr64_8( v ) \ - _mm_shuffle_epi8( v, _mm_set_epi64x( \ - 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) - -#define v128_shufll64_8( v ) \ - _mm_shuffle_epi8( v, _mm_set_epi64x( \ - 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) - -#define v128_shuflr64_24( v ) \ - _mm_shuffle_epi8( v, _mm_set_epi64x( \ - 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) - -#define v128_shufll64_24( v ) \ - _mm_shuffle_epi8( v, _mm_set_epi64x( \ - 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) - -#define v128_shuflr32_8( v ) \ - _mm_shuffle_epi8( v, _mm_set_epi64x( \ - 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) - -#define v128_shufll32_8( v ) \ - _mm_shuffle_epi8( v, _mm_set_epi64x( \ - 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) +#define v128_shuflr64_8( v ) _mm_shuffle_epi8( v, V128_SHUFLR64_8 ) +#define v128_shufll64_8( v ) _mm_shuffle_epi8( v, V128_SHUFLL64_8 ) +#define v128_shuflr64_24(v ) _mm_shuffle_epi8( v, V128_SHUFLR64_24 ) +#define v128_shufll64_24(v ) _mm_shuffle_epi8( v, V128_SHUFLL64_24 ) +#define v128_shuflr32_8( v ) _mm_shuffle_epi8( v, V128_SHUFLR32_8 ) +#define v128_shufll32_8( v ) _mm_shuffle_epi8( v, V128_SHUFLL32_8 ) #define v128_ror64( v, c ) \ ( (c) == 8 ) ? v128_shuflr64_8( v ) \ @@ -612,74 +595,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) // (v1 ^ v0) >>> n, ARM NEON has optimized version #define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n ) -/* not used -// x2 rotates elements in 2 individual vectors in a double buffered -// optimization for SSE2, does nothing for AVX512 but is there for -// transparency. 
- -#if defined(VL256) - -#define v128_2ror64( v1, v0, c ) \ - _mm_ror_epi64( v0, c ); \ - _mm_ror_epi64( v1, c ) - -#define v128_2rol64( v1, v0, c ) \ - _mm_rol_epi64( v0, c ); \ - _mm_rol_epi64( v1, c ) - -#define v128_2ror32( v1, v0, c ) \ - _mm_ror_epi32( v0, c ); \ - _mm_ror_epi32( v1, c ) - -#define v128_2rol32( v1, v0, c ) \ - _mm_rol_epi32( v0, c ); \ - _mm_rol_epi32( v1, c ) - -#else // SSE2 - -#define v128_2ror64( v1, v0, c ) \ -{ \ - __m128i t0 = _mm_srli_epi64( v0, c ); \ - __m128i t1 = _mm_srli_epi64( v1, c ); \ - v0 = _mm_slli_epi64( v0, 64-(c) ); \ - v1 = _mm_slli_epi64( v1, 64-(c) ); \ - v0 = _mm_or_si256( v0, t0 ); \ - v1 = _mm_or_si256( v1, t1 ); \ -} - -#define v128_2rol64( v1, v0, c ) \ -{ \ - __m128i t0 = _mm_slli_epi64( v0, c ); \ - __m128i t1 = _mm_slli_epi64( v1, c ); \ - v0 = _mm_srli_epi64( v0, 64-(c) ); \ - v1 = _mm_srli_epi64( v1, 64-(c) ); \ - v0 = _mm_or_si256( v0, t0 ); \ - v1 = _mm_or_si256( v1, t1 ); \ -} - -#define v128_2ror32( v1, v0, c ) \ -{ \ - __m128i t0 = _mm_srli_epi32( v0, c ); \ - __m128i t1 = _mm_srli_epi32( v1, c ); \ - v0 = _mm_slli_epi32( v0, 32-(c) ); \ - v1 = _mm_slli_epi32( v1, 32-(c) ); \ - v0 = _mm_or_si256( v0, t0 ); \ - v1 = _mm_or_si256( v1, t1 ); \ -} - -#define v128_2rol32( v1, v0, c ) \ -{ \ - __m128i t0 = _mm_slli_epi32( v0, c ); \ - __m128i t1 = _mm_slli_epi32( v1, c ); \ - v0 = _mm_srli_epi32( v0, 32-(c) ); \ - v1 = _mm_srli_epi32( v1, 32-(c) ); \ - v0 = _mm_or_si256( v0, t0 ); \ - v1 = _mm_or_si256( v1, t1 ); \ -} - -#endif // AVX512 else SSE2 -*/ - // Cross lane shuffles // No NEON version @@ -721,13 +636,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0001020304050607, \ 0x08090a0b0c0d0e0f ) ) -#define v128_bswap64( v ) \ - _mm_shuffle_epi8( v, _mm_set_epi64x( 0x08090a0b0c0d0e0f, \ - 0x0001020304050607 ) ) +#define v128_bswap64( v ) _mm_shuffle_epi8( v, V128_BSWAP64 ) + +#define v128_bswap32( v ) _mm_shuffle_epi8( v, V128_BSWAP32 ) -#define v128_bswap32( v ) \ - _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ - 0x0405060700010203 ) ) #define v128_bswap16( v ) \ _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \ 0x0607040502030001 ) @@ -735,85 +647,30 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) // 8 byte qword * 8 qwords * 2 lanes = 128 bytes #define v128_block_bswap64( d, s ) \ { \ - v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ - casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ - casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \ - casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \ - casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \ - casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \ - casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \ - casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \ - casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \ -} -#define v128_block_bswap64_512 v128_block_bswap64 - -#define v128_block_bswap64_1024( d, s ) \ -{ \ - v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ - casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \ - casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \ - casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \ - casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \ - casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), 
ctl ); \ - casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \ - casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \ - casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 7 ), ctl ); \ - casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \ - casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \ - casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \ - casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \ - casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \ - casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \ - casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \ - casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \ + casti_v128( d,0 ) = v128_bswap64( casti_v128( s,0 ) ); \ + casti_v128( d,1 ) = v128_bswap64( casti_v128( s,1 ) ); \ + casti_v128( d,2 ) = v128_bswap64( casti_v128( s,2 ) ); \ + casti_v128( d,3 ) = v128_bswap64( casti_v128( s,3 ) ); \ + casti_v128( d,4 ) = v128_bswap64( casti_v128( s,4 ) ); \ + casti_v128( d,5 ) = v128_bswap64( casti_v128( s,5 ) ); \ + casti_v128( d,6 ) = v128_bswap64( casti_v128( s,6 ) ); \ + casti_v128( d,7 ) = v128_bswap64( casti_v128( s,7 ) ); \ } // 4 byte dword * 8 dwords * 4 lanes = 128 bytes #define v128_block_bswap32( d, s ) \ { \ - v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ - casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ - casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \ - casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \ - casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \ - casti_v128( d,4 ) = _mm_shuffle_epi8( casti_v128( s,4 ), ctl ); \ - casti_v128( d,5 ) = _mm_shuffle_epi8( casti_v128( s,5 ), ctl ); \ - casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \ - casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \ + casti_v128( d,0 ) = v128_bswap32( casti_v128( s,0 ) ); \ + casti_v128( d,1 ) = v128_bswap32( casti_v128( s,1 ) ); \ + casti_v128( d,2 ) = v128_bswap32( casti_v128( s,2 ) ); \ + casti_v128( d,3 ) = v128_bswap32( casti_v128( s,3 ) ); \ + casti_v128( d,4 ) = v128_bswap32( casti_v128( s,4 ) ); \ + casti_v128( d,5 ) = v128_bswap32( casti_v128( s,5 ) ); \ + casti_v128( d,6 ) = v128_bswap32( casti_v128( s,6 ) ); \ + casti_v128( d,7 ) = v128_bswap32( casti_v128( s,7 ) ); \ } #define v128_block_bswap32_256 v128_block_bswap32 - -#define v128_block_bswap32_128( d, s ) \ -{ \ - v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ - casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ - casti_v128( d,1 ) = _mm_shuffle_epi8( casti_v128( s,1 ), ctl ); \ - casti_v128( d,2 ) = _mm_shuffle_epi8( casti_v128( s,2 ), ctl ); \ - casti_v128( d,3 ) = _mm_shuffle_epi8( casti_v128( s,3 ), ctl ); \ -} - -#define v128_block_bswap32_512( d, s ) \ -{ \ - v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ - casti_v128( d, 0 ) = _mm_shuffle_epi8( casti_v128( s, 0 ), ctl ); \ - casti_v128( d, 1 ) = _mm_shuffle_epi8( casti_v128( s, 1 ), ctl ); \ - casti_v128( d, 2 ) = _mm_shuffle_epi8( casti_v128( s, 2 ), ctl ); \ - casti_v128( d, 3 ) = _mm_shuffle_epi8( casti_v128( s, 3 ), ctl ); \ - casti_v128( d, 4 ) = _mm_shuffle_epi8( casti_v128( s, 4 ), ctl ); \ - casti_v128( d, 5 ) = _mm_shuffle_epi8( casti_v128( s, 5 ), ctl ); \ - casti_v128( d, 6 ) = _mm_shuffle_epi8( casti_v128( s, 6 ), ctl ); \ - casti_v128( d, 7 ) = _mm_shuffle_epi8( casti_v128( s, 
7 ), ctl ); \ - casti_v128( d, 8 ) = _mm_shuffle_epi8( casti_v128( s, 8 ), ctl ); \ - casti_v128( d, 9 ) = _mm_shuffle_epi8( casti_v128( s, 9 ), ctl ); \ - casti_v128( d,10 ) = _mm_shuffle_epi8( casti_v128( s,10 ), ctl ); \ - casti_v128( d,11 ) = _mm_shuffle_epi8( casti_v128( s,11 ), ctl ); \ - casti_v128( d,12 ) = _mm_shuffle_epi8( casti_v128( s,12 ), ctl ); \ - casti_v128( d,13 ) = _mm_shuffle_epi8( casti_v128( s,13 ), ctl ); \ - casti_v128( d,14 ) = _mm_shuffle_epi8( casti_v128( s,14 ), ctl ); \ - casti_v128( d,15 ) = _mm_shuffle_epi8( casti_v128( s,15 ), ctl ); \ -} - #else // SSE2 static inline v128_t v128_bswap64( __m128i v ) @@ -835,7 +692,7 @@ static inline v128_t v128_bswap16( __m128i v ) return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); } -#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) ) +#define v128_bswap128( v ) v128_rev64( v128_bswap64( v ) ) static inline void v128_block_bswap64( __m128i *d, const __m128i *s ) { @@ -849,26 +706,6 @@ static inline void v128_block_bswap64( __m128i *d, const __m128i *s ) d[7] = v128_bswap64( s[7] ); } -static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s ) -{ - d[ 0] = v128_bswap64( s[ 0] ); - d[ 1] = v128_bswap64( s[ 1] ); - d[ 2] = v128_bswap64( s[ 2] ); - d[ 3] = v128_bswap64( s[ 3] ); - d[ 4] = v128_bswap64( s[ 4] ); - d[ 5] = v128_bswap64( s[ 5] ); - d[ 6] = v128_bswap64( s[ 6] ); - d[ 7] = v128_bswap64( s[ 7] ); - d[ 8] = v128_bswap64( s[ 8] ); - d[ 9] = v128_bswap64( s[ 9] ); - d[10] = v128_bswap64( s[10] ); - d[11] = v128_bswap64( s[11] ); - d[14] = v128_bswap64( s[12] ); - d[13] = v128_bswap64( s[13] ); - d[14] = v128_bswap64( s[14] ); - d[15] = v128_bswap64( s[15] ); -} - static inline void v128_block_bswap32( __m128i *d, const __m128i *s ) { d[0] = v128_bswap32( s[0] ); @@ -882,26 +719,6 @@ static inline void v128_block_bswap32( __m128i *d, const __m128i *s ) } #define v128_block_bswap32_256 v128_block_bswap32 -static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s ) -{ - d[ 0] = v128_bswap32( s[ 0] ); - d[ 1] = v128_bswap32( s[ 1] ); - d[ 2] = v128_bswap32( s[ 2] ); - d[ 3] = v128_bswap32( s[ 3] ); - d[ 4] = v128_bswap32( s[ 4] ); - d[ 5] = v128_bswap32( s[ 5] ); - d[ 6] = v128_bswap32( s[ 6] ); - d[ 7] = v128_bswap32( s[ 7] ); - d[ 8] = v128_bswap32( s[ 8] ); - d[ 9] = v128_bswap32( s[ 9] ); - d[10] = v128_bswap32( s[10] ); - d[11] = v128_bswap32( s[11] ); - d[12] = v128_bswap32( s[12] ); - d[13] = v128_bswap32( s[13] ); - d[14] = v128_bswap32( s[14] ); - d[15] = v128_bswap32( s[15] ); -} - #endif // SSSE3 else SSE2 // alignr instruction for 32 & 64 bit elements is only available with AVX512 diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 8646dbe..56d78ab 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -61,8 +61,10 @@ typedef union #if defined(__AVX2__) // Broadcast, ie set1, from 128 bit vector input. -#define mm256_bcast_m128( v ) \ +#define mm256_bcast128( v ) \ _mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 ) +// deprecated +#define mm256_bcast_m128 mm256_bcast128 // Set either the low or high 64 bit elements in 128 bit lanes, other elements // are set to zero. 
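A brief illustration (hypothetical helper, not in the patch) of what the renamed mm256_bcast128 evaluates to: immediate 0x44 selects source qwords 0,1,0,1, so the 128 bit input is duplicated into both 128 bit lanes.

#include <immintrin.h>

static inline __m256i demo_bcast128( const __m128i x )
{
   // 0x44 = 0b01000100 : dest qwords 0..3 take source qwords 0,1,0,1,
   // i.e. the 128 bit input is duplicated into both 128 bit lanes.
   // _mm256_broadcastsi128_si256( x ) produces the same result.
   return _mm256_permute4x64_epi64( _mm256_castsi128_si256( x ), 0x44 );
}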
@@ -73,23 +75,23 @@ typedef union #else -#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) ) +#define mm256_bcast128lo_64( i64 ) mm256_bcast128( v128_mov64( i64 ) ) #define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \ _mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 ) #endif -#define mm256_set2_64( i1, i0 ) mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) ) +#define mm256_set2_64( i1, i0 ) mm256_bcast128( _mm_set_epi64x( i1, i0 ) ) #define mm256_set4_32( i3, i2, i1, i0 ) \ - mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) ) + mm256_bcast128( _mm_set_epi32( i3, i2, i1, i0 ) ) // All SIMD constant macros are actually functions containing executable // code and therefore can't be used as compile time initializers. #define m256_zero _mm256_setzero_si256() -#define m256_one_128 mm256_bcast_m128( v128_one ) +#define m256_one_128 mm256_bcast128( v128_one ) static inline __m256i mm256_neg1_fn() { @@ -231,21 +233,8 @@ static inline __m256i mm256_not( const __m256i v ) #define mm256_swap64_32 mm256_qrev32 // grandfathered #define mm256_qrev16(v) mm256_shuffle16( v, 0x1b ) - -#define mm256_qrev8(v) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - v128_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) ) - #define mm256_lrev16(v) mm256_shuffle16( v, 0xb1 ) -#define mm256_lrev8(v) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - v128_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) ) - -#define mm256_wrev8(v) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - v128_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) ) - // // Bit rotations. @@ -268,50 +257,33 @@ static inline __m256i mm256_not( const __m256i v ) #if defined(VL256) -#define mm256_ror_64 _mm256_ror_epi64 -#define mm256_rol_64 _mm256_rol_epi64 -#define mm256_ror_32 _mm256_ror_epi32 -#define mm256_rol_32 _mm256_rol_epi32 +#define mm256_ror_64 _mm256_ror_epi64 +#define mm256_rol_64 _mm256_rol_epi64 +#define mm256_ror_32 _mm256_ror_epi32 +#define mm256_rol_32 _mm256_rol_epi32 // Redundant but naming may be a better fit in some applications. -#define mm126_shuflr64_8( v) _mm256_ror_epi64( v, 8 ) -#define mm156_shufll64_8( v) _mm256_rol_epi64( v, 8 ) -#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 ) -#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 ) -#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 ) -#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 ) -#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 ) -#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 ) -#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 ) -#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 ) +#define mm256_shuflr64_8( v) _mm256_ror_epi64( v, 8 ) +#define mm256_shufll64_8( v) _mm256_rol_epi64( v, 8 ) +#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 ) +#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 ) +#define mm256_shuflr64_24(v) _mm256_ror_epi64( v, 24 ) +#define mm256_shufll64_24(v) _mm256_rol_epi64( v, 24 ) +#define mm256_shuflr32_8( v) _mm256_ror_epi32( v, 8 ) +#define mm256_shufll32_8( v) _mm256_rol_epi32( v, 8 ) +#define mm256_shuflr32_16(v) _mm256_ror_epi32( v, 16 ) +#define mm256_shufll32_16(v) _mm256_rol_epi32( v, 16 ) #else // ROR & ROL will always find the fastest but these names may be a better fit // in some applications. 
-#define mm256_shuflr64_8( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - _mm_set_epi64x( 0x080f0e0d0c0b0a09, 0x0007060504030201 ) ) ) - -#define mm256_shufll64_8( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - _mm_set_epi64x( 0x0e0d0c0b0a09080f, 0x0605040302010007 ) ) ) - -#define mm256_shuflr64_24( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - _mm_set_epi64x( 0x0a09080f0e0d0c0b, 0x0201000706050403 ) ) ) - -#define mm256_shufll64_24( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - _mm_set_epi64x( 0x0c0b0a09080f0e0d, 0x0403020100070605 ) ) ) - -#define mm256_shuflr32_8( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - _mm_set_epi64x( 0x0c0f0e0d080b0a09, 0x0407060500030201 ) ) ) - -#define mm256_shufll32_8( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( \ - _mm_set_epi64x( 0x0e0d0c0f0a09080b, 0x0605040702010003 ) ) ) +#define mm256_shuflr64_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLR64_8 ) +#define mm256_shufll64_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLL64_8 ) +#define mm256_shuflr64_24(v ) _mm256_shuffle_epi8( v, V256_SHUFLR64_24 ) +#define mm256_shufll64_24(v ) _mm256_shuffle_epi8( v, V256_SHUFLL64_24 ) +#define mm256_shuflr32_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLR32_8 ) +#define mm256_shufll32_8( v ) _mm256_shuffle_epi8( v, V256_SHUFLL32_8 ) #define mm256_ror_64( v, c ) \ ( (c) == 8 ) ? mm256_shuflr64_8( v ) \ @@ -347,96 +319,6 @@ static inline __m256i mm256_not( const __m256i v ) #endif -// -// x2 rotates elements in 2 individual vectors in a double buffered -// optimization for AVX2, does nothing for AVX512 but is here for -// transparency. - -#if defined(VL256) -/* -#define mm256_ror_64 _mm256_ror_epi64 -#define mm256_rol_64 _mm256_rol_epi64 -#define mm256_ror_32 _mm256_ror_epi32 -#define mm256_rol_32 _mm256_rol_epi32 -*/ -#define mm256_rorx2_64( v1, v0, c ) \ - _mm256_ror_epi64( v0, c ); \ - _mm256_ror_epi64( v1, c ) - -#define mm256_rolx2_64( v1, v0, c ) \ - _mm256_rol_epi64( v0, c ); \ - _mm256_rol_epi64( v1, c ) - -#define mm256_rorx2_32( v1, v0, c ) \ - _mm256_ror_epi32( v0, c ); \ - _mm256_ror_epi32( v1, c ) - -#define mm256_rolx2_32( v1, v0, c ) \ - _mm256_rol_epi32( v0, c ); \ - _mm256_rol_epi32( v1, c ) - -#else // AVX2 -/* -// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8. 
- -#define mm256_ror_64( v, c ) \ - _mm256_or_si256( _mm256_srli_epi64( v, c ), \ - _mm256_slli_epi64( v, 64-(c) ) ) - -#define mm256_rol_64( v, c ) \ - _mm256_or_si256( _mm256_slli_epi64( v, c ), \ - _mm256_srli_epi64( v, 64-(c) ) ) - -#define mm256_ror_32( v, c ) \ - _mm256_or_si256( _mm256_srli_epi32( v, c ), \ - _mm256_slli_epi32( v, 32-(c) ) ) - -#define mm256_rol_32( v, c ) \ - _mm256_or_si256( _mm256_slli_epi32( v, c ), \ - _mm256_srli_epi32( v, 32-(c) ) ) -*/ -#define mm256_rorx2_64( v1, v0, c ) \ -{ \ - __m256i t0 = _mm256_srli_epi64( v0, c ); \ - __m256i t1 = _mm256_srli_epi64( v1, c ); \ - v0 = _mm256_slli_epi64( v0, 64-(c) ); \ - v1 = _mm256_slli_epi64( v1, 64-(c) ); \ - v0 = _mm256_or_si256( v0, t0 ); \ - v1 = _mm256_or_si256( v1, t1 ); \ -} - -#define mm256_rolx2_64( v1, v0, c ) \ -{ \ - __m256i t0 = _mm256_slli_epi64( v0, c ); \ - __m256i t1 = _mm256_slli_epi64( v1, c ); \ - v0 = _mm256_srli_epi64( v0, 64-(c) ); \ - v1 = _mm256_srli_epi64( v1, 64-(c) ); \ - v0 = _mm256_or_si256( v0, t0 ); \ - v1 = _mm256_or_si256( v1, t1 ); \ -} - -#define mm256_rorx2_32( v1, v0, c ) \ -{ \ - __m256i t0 = _mm256_srli_epi32( v0, c ); \ - __m256i t1 = _mm256_srli_epi32( v1, c ); \ - v0 = _mm256_slli_epi32( v0, 32-(c) ); \ - v1 = _mm256_slli_epi32( v1, 32-(c) ); \ - v0 = _mm256_or_si256( v0, t0 ); \ - v1 = _mm256_or_si256( v1, t1 ); \ -} - -#define mm256_rolx2_32( v1, v0, c ) \ -{ \ - __m256i t0 = _mm256_slli_epi32( v0, c ); \ - __m256i t1 = _mm256_slli_epi32( v1, c ); \ - v0 = _mm256_srli_epi32( v0, 32-(c) ); \ - v1 = _mm256_srli_epi32( v1, 32-(c) ); \ - v0 = _mm256_or_si256( v0, t0 ); \ - v1 = _mm256_or_si256( v1, t1 ); \ -} - -#endif // AVX512 else AVX2 - #if defined(__AVX2__) // 128 bit version of unpack @@ -453,20 +335,14 @@ static inline __m256i mm256_not( const __m256i v ) // // Cross lane shuffles // -// Rotate elements accross all lanes. -#define mm256_shuffle_16( v, c ) \ - _mm256_or_si256( _mm256_shufflehi_epi16( v, c ), \ - _mm256_shufflelo_epi16( v, c ) ) // Swap 128 bit elements in 256 bit vector. -#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) #define mm256_rev_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) +#define mm256_swap_128 mm256_rev_128 // grandfathered -// Rotate 256 bit vector by one 64 bit element -#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) -// Reverse 64 bit elements +/* not used +// Reverse elements #define mm256_rev_64( v ) _mm256_permute4x64_epi64( v, 0x1b ) #define mm256_rev_32( v ) \ @@ -474,7 +350,12 @@ static inline __m256i mm256_not( const __m256i v ) 0x0000000400000005, 0x0000000600000007 ) #define mm256_rev_16( v ) \ - _mm256_permute4x64_epi64( mm256_shuffle_16( v, 0x1b ), 0x4e ) + _mm256_permute4x64_epi64( mm256_shuffle16( v, 0x1b ), 0x4e ) +*/ + +// Rotate 256 bit vector by one 64 bit element +#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 ) +#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) /* Not used // Rotate 256 bit vector by one 32 bit element. 
@@ -486,7 +367,7 @@ static inline __m256i mm256_shufll_32( const __m256i v ) #else #define mm256_shuflr_32( v ) \ _mm256_permutevar8x32_epi32( v, \ - _mm256_set_spi64x( 0x0000000000000007, 0x0000000600000005, \ + _mm256_set_epi64x( 0x0000000000000007, 0x0000000600000005, \ 0x0000000400000003, 0x0000000200000001 ) ) #define mm256_shufll_32( v ) \ _mm256_permutevar8x32_epi32( v, \ @@ -507,113 +388,64 @@ static inline __m256i mm256_shufll_32( const __m256i v ) _mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \ _mm256_castsi256_ps( v2 ), c ) ); -#define mm256_swap128_64(v) _mm256_shuffle_epi32( v, 0x4e ) #define mm256_rev128_64(v) _mm256_shuffle_epi32( v, 0x4e ) +#define mm256_swap128_64 mm256_rev128_64 // grandfathered + +/*not used #define mm256_rev128_32(v) _mm256_shuffle_epi32( v, 0x1b ) -#define mm256_rev128_16(v) mm256_shuffle_16( v, 0x1b ) +#define mm256_rev128_16(v) mm256_shuffle16( v, 0x1b ) +*/ #define mm256_shuflr128_32(v) _mm256_shuffle_epi32( v, 0x39 ) #define mm256_shufll128_32(v) _mm256_shuffle_epi32( v, 0x93 ) -#define mm256_shuflr128_16(v) mm256_shuffle_16( v, 0x39 ) -#define mm256_shufll128_16(v) mm256_shuffle_16( v, 0x93 ) +/* not used +#define mm256_shuflr128_16(v) mm256_shuffle16( v, 0x39 ) +#define mm256_shufll128_16(v) mm256_shuffle16( v, 0x93 ) -/* Not used static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c ) { return _mm256_alignr_epi8( v, v, c ); } */ // Reverse byte order in elements, endian bswap. -#define mm256_bswap_64( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) ) +#define mm256_bswap_64( v ) _mm256_shuffle_epi8( v, V256_BSWAP64 ) -#define mm256_bswap_32( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \ - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) ) +#define mm256_bswap_32( v ) _mm256_shuffle_epi8( v, V256_BSWAP32 ) +/* not used #define mm256_bswap_16( v ) \ - _mm256_shuffle_epi8( v, mm256_bcast_m128( _mm_set_epi64x( \ + _mm256_shuffle_epi8( v, mm256_bcast128( _mm_set_epi64x( \ 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) ) -// +*/ // Source and destination are pointers, may point to same memory. 
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes #define mm256_block_bswap_64( d, s ) \ { \ - __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \ - 0x0001020304050607 ) ); \ - casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ - casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ - casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ - casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \ - casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \ - casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \ - casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \ - casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \ -} -#define mm256_block_bswap64_512 mm256_block_bswap_64 - -#define mm256_block_bswap64_1024( d, s ) \ -{ \ - __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f, \ - 0x0001020304050607 ) ); \ - casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ - casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ - casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ - casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \ - casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \ - casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \ - casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \ - casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \ - casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \ - casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \ - casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \ - casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \ - casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \ - casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \ - casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \ - casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \ + casti_m256i( d,0 ) = mm256_bswap_64( casti_m256i( s,0 ) ); \ + casti_m256i( d,1 ) = mm256_bswap_64( casti_m256i( s,1 ) ); \ + casti_m256i( d,2 ) = mm256_bswap_64( casti_m256i( s,2 ) ); \ + casti_m256i( d,3 ) = mm256_bswap_64( casti_m256i( s,3 ) ); \ + casti_m256i( d,4 ) = mm256_bswap_64( casti_m256i( s,4 ) ); \ + casti_m256i( d,5 ) = mm256_bswap_64( casti_m256i( s,5 ) ); \ + casti_m256i( d,6 ) = mm256_bswap_64( casti_m256i( s,6 ) ); \ + casti_m256i( d,7 ) = mm256_bswap_64( casti_m256i( s,7 ) ); \ } // 4 byte dword * 8 dwords * 8 lanes = 256 bytes #define mm256_block_bswap_32( d, s ) \ { \ - __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ - 0x0405060700010203 ) ); \ - casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ - casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ - casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ - casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \ - casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \ - casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \ - casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \ - casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \ + casti_m256i( d, 0 ) = mm256_bswap_32( casti_m256i( s, 
0 ) ); \ + casti_m256i( d, 1 ) = mm256_bswap_32( casti_m256i( s, 1 ) ); \ + casti_m256i( d, 2 ) = mm256_bswap_32( casti_m256i( s, 2 ) ); \ + casti_m256i( d, 3 ) = mm256_bswap_32( casti_m256i( s, 3 ) ); \ + casti_m256i( d, 4 ) = mm256_bswap_32( casti_m256i( s, 4 ) ); \ + casti_m256i( d, 5 ) = mm256_bswap_32( casti_m256i( s, 5 ) ); \ + casti_m256i( d, 6 ) = mm256_bswap_32( casti_m256i( s, 6 ) ); \ + casti_m256i( d, 7 ) = mm256_bswap_32( casti_m256i( s, 7 ) ); \ } #define mm256_block_bswap32_256 mm256_block_bswap_32 -#define mm256_block_bswap32_512( d, s ) \ -{ \ - __m256i ctl = mm256_bcast_m128( _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ - 0x0405060700010203 ) ); \ - casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ - casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ - casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ - casti_m256i( d, 3 ) = _mm256_shuffle_epi8( casti_m256i( s, 3 ), ctl ); \ - casti_m256i( d, 4 ) = _mm256_shuffle_epi8( casti_m256i( s, 4 ), ctl ); \ - casti_m256i( d, 5 ) = _mm256_shuffle_epi8( casti_m256i( s, 5 ), ctl ); \ - casti_m256i( d, 6 ) = _mm256_shuffle_epi8( casti_m256i( s, 6 ), ctl ); \ - casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \ - casti_m256i( d, 8 ) = _mm256_shuffle_epi8( casti_m256i( s, 8 ), ctl ); \ - casti_m256i( d, 9 ) = _mm256_shuffle_epi8( casti_m256i( s, 9 ), ctl ); \ - casti_m256i( d,10 ) = _mm256_shuffle_epi8( casti_m256i( s,10 ), ctl ); \ - casti_m256i( d,11 ) = _mm256_shuffle_epi8( casti_m256i( s,11 ), ctl ); \ - casti_m256i( d,12 ) = _mm256_shuffle_epi8( casti_m256i( s,12 ), ctl ); \ - casti_m256i( d,13 ) = _mm256_shuffle_epi8( casti_m256i( s,13 ), ctl ); \ - casti_m256i( d,14 ) = _mm256_shuffle_epi8( casti_m256i( s,14 ), ctl ); \ - casti_m256i( d,15 ) = _mm256_shuffle_epi8( casti_m256i( s,15 ), ctl ); \ -} - #if defined(VL256) #define mm256_alignr64 _mm256_alignr_epi64 diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 99e114b..629a41a 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -108,11 +108,13 @@ typedef union // A simple 128 bit permute, using function instead of macro avoids // problems if the v arg passed as an expression. -static inline __m512i mm512_perm_128( const __m512i v, const int c ) +static inline __m512i mm512_perm128( const __m512i v, const int c ) { return _mm512_shuffle_i64x2( v, v, c ); } // Broadcast 128 bit vector to all lanes of 512 bit vector. -#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 ) +#define mm512_bcast128( v ) mm512_perm128( _mm512_castsi128_si512( v ), 0 ) +// deprecated +#define mm512_bcast_m128 mm512_bcast128 // Set either the low or high 64 bit elements in 128 bit lanes, other elements // are set to zero. @@ -120,7 +122,7 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c ) #define mm512_bcast128hi_64( i64 ) _mm512_maskz_set1_epi64( 0xaa, i64 ) #define mm512_set2_64( i1, i0 ) \ - mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) ) + mm512_bcast128( _mm_set_epi64x( i1, i0 ) ) // Pseudo constants. #define m512_zero _mm512_setzero_si512() @@ -248,105 +250,57 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // // Reverse byte order of packed elements, vectorized endian conversion. 
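A hedged sketch (not in the patch, helper names invented) of the two ways the shared V512_BSWAP32 constant is consumed, as described in simd-utils.h: shuffle_epi8 byte swaps every 32 bit element, while permutexvar_epi8 (AVX512VBMI) byte swaps the low 128 bits and broadcasts them to all lanes, the form used by mm512_bswap32_intrlv80_4x128.

#include <immintrin.h>

extern const __m512i V512_BSWAP32;   // defined in simd-utils/simd-constants.c

// AVX512BW: byte swap every 32 bit element of v.
static inline __m512i demo_bswap32_512( const __m512i v )
{  return _mm512_shuffle_epi8( v, V512_BSWAP32 );  }

// AVX512VBMI: byte swap the 32 bit elements in the low 128 bits of s and
// broadcast the swapped lane to all four 128 bit lanes of the result.
static inline __m512i demo_bswap32_bcast128( const __m128i s )
{  return _mm512_permutexvar_epi8( V512_BSWAP32, _mm512_castsi128_si512( s ) );  }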
-#define mm512_bswap_64( v ) \ - _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ) ) +#define mm512_bswap_64( v ) _mm512_shuffle_epi8( v, V512_BSWAP64 ) -#define mm512_bswap_32( v ) \ - _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \ - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ) ) +#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 ) + +/* not used +#define mm512_bswap_16( v ) \ + _mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \ + 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) ) +*/ #define mm512_bswap_16( v ) \ - _mm512_shuffle_epi8( v, mm512_bcast_m128( _mm_set_epi64x( \ - 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) ) // Source and destination are pointers, may point to same memory. // 8 lanes of 64 bytes each #define mm512_block_bswap_64( d, s ) \ { \ - const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \ - casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ - casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ - casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ - casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \ - casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \ - casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \ - casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \ - casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ -} -#define mm512_block_bswap64_512 mm512_block_bswap_64 - -#define mm512_block_bswap64_1024( d, s ) \ -{ \ - const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \ - casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ - casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ - casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ - casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \ - casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \ - casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \ - casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \ - casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ - casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \ - casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \ - casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \ - casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \ - casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \ - casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \ - casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \ - casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \ + casti_m512i( d, 0 ) = mm512_bswap_64( casti_m512i( s, 0 ) ); \ + casti_m512i( d, 1 ) = mm512_bswap_64( casti_m512i( s, 1 ) ); \ + casti_m512i( d, 2 ) = mm512_bswap_64( casti_m512i( s, 2 ) ); \ + casti_m512i( d, 3 ) = mm512_bswap_64( casti_m512i( s, 3 ) ); \ + casti_m512i( d, 4 ) = mm512_bswap_64( casti_m512i( s, 4 ) ); \ + casti_m512i( d, 5 ) = mm512_bswap_64( casti_m512i( s, 5 ) ); \ + casti_m512i( d, 6 ) = mm512_bswap_64( casti_m512i( s, 6 ) ); \ + casti_m512i( d, 7 ) = mm512_bswap_64( casti_m512i( s, 7 ) ); \ } // 16 lanes of 32 bytes each #define mm512_block_bswap_32( d, s 
) \ { \ - const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \ - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \ - casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ - casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ - casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ - casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \ - casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \ - casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \ - casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \ - casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ + casti_m512i( d, 0 ) = mm512_bswap_32( casti_m512i( s, 0 ) ); \ + casti_m512i( d, 1 ) = mm512_bswap_32( casti_m512i( s, 1 ) ); \ + casti_m512i( d, 2 ) = mm512_bswap_32( casti_m512i( s, 2 ) ); \ + casti_m512i( d, 3 ) = mm512_bswap_32( casti_m512i( s, 3 ) ); \ + casti_m512i( d, 4 ) = mm512_bswap_32( casti_m512i( s, 4 ) ); \ + casti_m512i( d, 5 ) = mm512_bswap_32( casti_m512i( s, 5 ) ); \ + casti_m512i( d, 6 ) = mm512_bswap_32( casti_m512i( s, 6 ) ); \ + casti_m512i( d, 7 ) = mm512_bswap_32( casti_m512i( s, 7 ) ); \ } #define mm512_block_bswap32_256 mm512_block_bswap_32 -#define mm512_block_bswap32_512( d, s ) \ -{ \ - const __m512i ctl = mm512_bcast_m128( _mm_set_epi64x( \ - 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \ - casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \ - casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \ - casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \ - casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \ - casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \ - casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \ - casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \ - casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \ - casti_m512i( d, 8 ) = _mm512_shuffle_epi8( casti_m512i( s, 8 ), ctl ); \ - casti_m512i( d, 9 ) = _mm512_shuffle_epi8( casti_m512i( s, 9 ), ctl ); \ - casti_m512i( d,10 ) = _mm512_shuffle_epi8( casti_m512i( s,10 ), ctl ); \ - casti_m512i( d,11 ) = _mm512_shuffle_epi8( casti_m512i( s,11 ), ctl ); \ - casti_m512i( d,12 ) = _mm512_shuffle_epi8( casti_m512i( s,12 ), ctl ); \ - casti_m512i( d,13 ) = _mm512_shuffle_epi8( casti_m512i( s,13 ), ctl ); \ - casti_m512i( d,14 ) = _mm512_shuffle_epi8( casti_m512i( s,14 ), ctl ); \ - casti_m512i( d,15 ) = _mm512_shuffle_epi8( casti_m512i( s,15 ), ctl ); \ -} - - - // Cross-lane shuffles implementing rotation of packed elements. // +// shuffle 16 bit elements within 64 bit lanes. +#define mm512_shuffle16( v, c ) \ + _mm512_shufflehi_epi16( _mm512_shufflelo_epi16( v, c ), c ) + // Rotate elements across entire vector. -static inline __m512i mm512_swap_256( const __m512i v ) +static inline __m512i mm512_rev_256( const __m512i v ) { return _mm512_alignr_epi64( v, v, 4 ); } -#define mm512_shuflr_256 mm512_swap_256 -#define mm512_shufll_256 mm512_swap_256 +#define mm512_swap_256 mm512_rev_256 // grandfathered static inline __m512i mm512_shuflr_128( const __m512i v ) { return _mm512_alignr_epi64( v, v, 2 ); } @@ -394,9 +348,8 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) // Rotate elements within 256 bit lanes of 512 bit vector. 
// Swap hi & lo 128 bits in each 256 bit lane -#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e ) -#define mm512_shuflr256_128 mm512_swap256_128 -#define mm512_shufll256_128 mm512_swap256_128 +#define mm512_rev256_128( v ) _mm512_permutex_epi64( v, 0x4e ) +#define mm512_swap256_128 mm512_rev256_128 // grandfathered // Rotate 256 bit lanes by one 64 bit element #define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) @@ -450,15 +403,23 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) // // Shuffle/rotate elements within 128 bit lanes of 512 bit vector. -#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) -#define mm512_shuflr128_64 mm512_swap128_64 -#define mm512_shufll128_64 mm512_swap128_64 +#define mm512_rev128_64( v ) _mm512_shuffle_epi32( v, 0x4e ) +#define mm512_swap128_64 mm512_rev128_64 // grandfathered + +/*not used +#define mm512_rev128_32(v) _mm526_shuffle_epi32( v, 0x1b ) +#define mm512_rev128_16(v) mm512_shuffle16( v, 0x1b ) +*/ // Rotate 128 bit lanes by one 32 bit element #define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 ) #define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 ) /* Not used + +#define mm512_shuflr128_16(v) mm512_shuffle16( v, 0x39 ) +#define mm512_shufll128_16(v) mm512_shuffle16( v, 0x93 ) + // Rotate 128 bit lanes right by c bytes, versatile and just as fast static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c ) { return _mm512_alignr_epi8( v, v, c ); } @@ -476,11 +437,10 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c ) _mm512_castsi512_ps( v2 ), c ) ); // 64 bit lanes -// Not really necessary with AVX512, included for consistency with AVX2/SSE. +// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE. 
-#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 ) -#define mm512_shuflr64_32 mm512_swap64_32 -#define mm512_shufll64_32 mm512_swap64_32 +#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 ) +#define mm512_swap64_32 mm512_qrev32 // grandfathered #define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 ) #define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 ) @@ -494,9 +454,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c ) /* Not used // 32 bit lanes -#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 ) -#define mm512_shuflr32_16 mm512_swap32_16 -#define mm512_shufll32_16 mm512_swap32_16 +#define mm512_lrev16( v ) _mm512_ror_epi32( v, 16 ) #define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 ) #define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 ) diff --git a/simd-utils/simd-constants.c b/simd-utils/simd-constants.c new file mode 100644 index 0000000..7c339e1 --- /dev/null +++ b/simd-utils/simd-constants.c @@ -0,0 +1,55 @@ +#include "simd-utils.h" + +#if defined(SIMD512) + +const __m512i V512_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f, + 0x0001020304050607, 0x08090a0b0c0d0e0f, + 0x0001020304050607, 0x08090a0b0c0d0e0f, + 0x0001020304050607, 0x08090a0b0c0d0e0f }; + +const __m512i V512_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b, + 0x0405060700010203, 0x0c0d0e0f08090a0b, + 0x0405060700010203, 0x0c0d0e0f08090a0b, + 0x0405060700010203, 0x0c0d0e0f08090a0b }; + +#elif defined(__AVX2__) + +const __m256i V256_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f, + 0x0001020304050607, 0x08090a0b0c0d0e0f }; + +const __m256i V256_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b, + 0x0405060700010203, 0x0c0d0e0f08090a0b }; + +const __m256i V256_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09, + 0x0007060504030201, 0x080f0e0d0c0b0a09 }; + +const __m256i V256_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b, + 0x0201000706050403, 0x0a09080f0e0d0c0b }; + +const __m256i V256_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f, + 0x0605040302010007, 0x0e0d0c0b0a09080f }; + +const __m256i V256_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d, + 0x0403020100070605, 0x0c0b0a09080f0e0d }; + +const __m256i V256_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09, + 0x0407060500030201, 0x0c0f0e0d080b0a09 }; + +const __m256i V256_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b, + 0x0605040702010003, 0x0e0d0c0f0a09080b }; + +#elif defined(__SSSE3__) + +const v128_t V128_BSWAP64 = { 0x0001020304050607, 0x08090a0b0c0d0e0f }; +const v128_t V128_BSWAP32 = { 0x0405060700010203, 0x0c0d0e0f08090a0b }; + +const v128_t V128_SHUFLR64_8 = { 0x0007060504030201, 0x080f0e0d0c0b0a09 }; +const v128_t V128_SHUFLR64_24 = { 0x0201000706050403, 0x0a09080f0e0d0c0b }; +const v128_t V128_SHUFLL64_8 = { 0x0605040302010007, 0x0e0d0c0b0a09080f }; +const v128_t V128_SHUFLL64_24 = { 0x0403020100070605, 0x0c0b0a09080f0e0d }; + +const v128_t V128_SHUFLR32_8 = { 0x0407060500030201, 0x0c0f0e0d080b0a09 }; +const v128_t V128_SHUFLL32_8 = { 0x0605040702010003, 0x0e0d0c0f0a09080b }; + +#endif + diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h index 97e74ec..7f15323 100644 --- a/simd-utils/simd-neon.h +++ b/simd-utils/simd-neon.h @@ -14,10 +14,10 @@ // veor3q( v2, v1, v0 ) xor3 v2 ^ v1 ^ v0 // vxarq_u64( v1, v0, n ) ror64xor ( v1 ^ v0 ) >>> n ) // vbcaxq_u{64,32,16,8}( v2, v1, v0 ) xorandnot v2 ^ ( v1 & ~v0 ) +// vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n ) // // not used anywhere yet -// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 ) -// 
vsraq_n_u{64,32,16,8}( v1, v0, n ) v1 + ( v0 >> n ) +// vrax1q_u64( v1, v0 ) v1 ^ ( v0 <<< 1 ) #define v128_t uint32x4_t // default, #define v128u64_t uint64x2_t @@ -124,7 +124,7 @@ // ~v1 & v0 #define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 ) -// ~( a ^ b ), same as (~a) ^ b +// ~( v1 ^ v0 ), same as (~v1) ^ v0 #define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) ) // ~v1 | v0, args reversed for consistency with x86_64 @@ -136,8 +136,11 @@ // known way to test arm minor version. #if defined(__ARM_FEATURE_SHA3) #define v128_xor3 veor3q_u32 + #define v128_xor4( v3, v2, v1, v0 ) veorq_u32( v3, veor3q_u32( v2, v1, v0 ) ) #else #define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 ) + #define v128_xor4( v3, v2, v1, v0 ) veorq_u32 ( veorq_u32( v3, v2 ), \ + veorq_u32( v1, v0 ) ) #endif // v2 & v1 & v0 @@ -153,13 +156,13 @@ #define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) ) #endif -// a ^ ( b & c ) +// v2 ^ ( v1 & v0 ) #define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) ) -// a & ( b ^ c ) +// v2 & ( v1 ^ v0 ) #define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) ) -// a ^ ( b | c ) +// v2 ^ ( v1 | v0 ) #define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) ) // v2 | ( v1 & v0 ) @@ -240,7 +243,7 @@ typedef union #define cast_v128u32( p ) (*((uint32x4_t*)(p))) #define castp_v128u32( p ) ((uint32x4_t*)(p)) -// set1 +// set1, integer argument #define v128_64 vmovq_n_u64 #define v128_32 vmovq_n_u32 #define v128_16 vmovq_n_u16 @@ -326,10 +329,59 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) } // how to build a bitmask from vector elements? Efficiently??? -#define v128_movmask32 -#define v128_movmask64 +//#define v128_movmask32 +//#define v128_movmask64 + +#define v128_shuffle8( v, vmask ) \ + vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) ) // Bit rotation +/* +#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 ) +#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 ) +#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 ) +#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 ) +#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 ) +#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 ) +#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 ) +#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 ) + +#define v128_ror64( v, c ) \ + ( (c) == 8 ) ? v128_shuflr64_8( v ) \ + : ( (c) == 16 ) ? v128_shuflr64_16( v ) \ + : ( (c) == 24 ) ? v128_shuflr64_24( v ) \ + : ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ + : ( (c) == 40 ) ? v128_shufll64_24( v ) \ + : ( (c) == 48 ) ? v128_shufll64_16( v ) \ + : ( (c) == 56 ) ? v128_shufll64_8( v ) \ + : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \ + ((uint64x2_t)(v)), c ) + +#define v128_rol64( v, c ) \ + ( (c) == 8 ) ? v128_shufll64_8( v ) \ + : ( (c) == 16 ) ? v128_shufll64_16( v ) \ + : ( (c) == 24 ) ? v128_shufll64_24( v ) \ + : ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ + : ( (c) == 40 ) ? v128_shuflr64_24( v ) \ + : ( (c) == 48 ) ? v128_shuflr64_16( v ) \ + : ( (c) == 56 ) ? v128_shuflr64_8( v ) \ + : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \ + ((uint64x2_t)(v)), c ) + +#define v128_ror32( v, c ) \ + ( (c) == 8 ) ? v128_shuflr32_8( v ) \ + : ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \ + : ( (c) == 24 ) ? 
v128_shufll32_8( v ) \ + : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \ + ((uint32x4_t)(v)), c ) + +#define v128_rol32( v, c ) \ + ( (c) == 8 ) ? v128_shufll32_8( v ) \ + : ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \ + : ( (c) == 24 ) ? v128_shuflr32_8( v ) \ + : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \ + ((uint32x4_t)(v)), c ) +*/ #define v128_ror64( v, c ) \ ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ @@ -351,6 +403,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) : vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \ ((uint32x4_t)(v)), c ) +/* not used #define v128_ror16( v, c ) \ ( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \ : vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \ @@ -368,6 +421,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) #define v128_rol8( v, c ) \ vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \ ((uint8x16_t)(v)), c ) +*/ // ( v1 ^ v0 ) >>> c #if defined(__ARM_FEATURE_SHA3) @@ -376,57 +430,13 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) #define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c ) #endif -#define v128_2ror64( v1, v0, c ) \ -{ \ - uint64x2_t t0 = vshrq_n_u64( v0, c ); \ - uint64x2_t t1 = vshrq_n_u64( v1, c ); \ - v0 = vsliq_n_u64( v0, 64-(c) ); \ - v1 = vsliq_n_u64( v1, 64-(c) ); \ - v0 = vorrq_u64( v0, t0 ); \ - v1 = vorrq_u64( v1, t1 ); \ -} - -#define v128_2rol64_( v1, v0, c ) \ -{ \ - uint64x2_t t0 = vshlq_n_u64( v0, c ); \ - uint64x2_t t1 = vshlq_n_u64( v1, c ); \ - v0 = vsriq_n_u64( v0, 64-(c) ); \ - v1 = vsriq_n_u64( v1, 64-(c) ); \ - v0 = vorrq_u64( v0, t0 ); \ - v1 = vorrq_u64( v1, t1 ); \ -} - -#define v128_2rorl32( v1, v0, c ) \ -{ \ - uint32x4_t t0 = vshrq_n_u32( v0, c ); \ - uint32x4_t t1 = vshrq_n_u32( v1, c ); \ - v0 = vsliq_n_u32( v0, 32-(c) ); \ - v1 = vsliq_n_u32( v1, 32-(c) ); \ - v0 = vorrq_32( v0, t0 ); \ - v1 = vorrq_u32( v1, t1 ); \ -} - -#define v128_2ror32( v1, v0, c ) \ -{ \ - uint32x4_t t0 = vshlq_n_u32( v0, c ); \ - uint32x4_t t1 = vshlq_n_u32( v1, c ); \ - v0 = vsriq_n_u32( v0, 32-(c) ); \ - v1 = vsriq_n_u32( v1, 32-(c) ); \ - v0 = vorrq_u32( v0, t0 ); \ - v1 = vorrq_u32( v1, t1 ); \ -} - -/* not used anywhere and hopefully never will -// vector mask, use as last resort. prefer tbl, rev, alignr, etc -#define v128_shufflev32( v, vmask ) \ - v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \ - ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \ - ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \ - ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \ +/* not used +// v1 + ( v0 >> c ) +#define v128_addsr64( v1, v0, c ) vsraq_n_u64( v1, v0, c ) +#define v128_addsr32( v1, v0, c ) vsraq_n_u32( v1, v0, c ) */ -#define v128_shuffle8( v, vmask ) \ - vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) ) +// Cross lane shuffle // sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster. // Bit rotation already promotes faster widths. Usage is context sensitive. 
@@ -438,19 +448,14 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 #define v128_qrev16      vrev64q_u16
 #define v128_lrev16      vrev32q_u16
 
-// aka bswap
-// #define v128_qrev8    vrev64q_u8
-// #define v128_lrev8    vrev32q_u8
-// #define v128_wrev8    vrev16q_u8
-
 // full vector rotation
 
 // reverse elements in vector
 static inline uint64x2_t v128_rev64( uint64x2_t v )
 {   return vextq_u64( v, v, 1 ); }
-#define v128_swap64   v128_rev64   // grandfathered
+#define v128_swap64      v128_rev64   // grandfathered
 
-#define v128_rev32(v)   v128_rev64( v128_qrev32( v ) )
+#define v128_rev32(v)    v128_rev64( v128_qrev32( v ) )
 
 // shuffle-rotate vector elements
 static inline uint32x4_t v128_shuflr32( uint32x4_t v )
@@ -468,7 +473,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
 #define v128_bswap64(v)    (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
 #define v128_bswap128(v)   (uint32x4_t)v128_rev64( v128_bswap64(v) )
 
-// Useful for x86_64 but does nothing for ARM
 #define v128_block_bswap32( dst, src ) \
 { \
    casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
@@ -482,26 +486,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
 }
 #define v128_block_bswap32_256    v128_block_bswap32
 
-#define v128_block_bswap32_512( dst, src ) \
-{ \
-   casti_v128u32( dst, 0 ) = v128_bswap32( casti_v128u32( src, 0 ) ); \
-   casti_v128u32( dst, 1 ) = v128_bswap32( casti_v128u32( src, 1 ) ); \
-   casti_v128u32( dst, 2 ) = v128_bswap32( casti_v128u32( src, 2 ) ); \
-   casti_v128u32( dst, 3 ) = v128_bswap32( casti_v128u32( src, 3 ) ); \
-   casti_v128u32( dst, 4 ) = v128_bswap32( casti_v128u32( src, 4 ) ); \
-   casti_v128u32( dst, 5 ) = v128_bswap32( casti_v128u32( src, 5 ) ); \
-   casti_v128u32( dst, 6 ) = v128_bswap32( casti_v128u32( src, 6 ) ); \
-   casti_v128u32( dst, 7 ) = v128_bswap32( casti_v128u32( src, 7 ) ); \
-   casti_v128u32( dst, 8 ) = v128_bswap32( casti_v128u32( src, 8 ) ); \
-   casti_v128u32( dst, 9 ) = v128_bswap32( casti_v128u32( src, 9 ) ); \
-   casti_v128u32( dst,10 ) = v128_bswap32( casti_v128u32( src,10 ) ); \
-   casti_v128u32( dst,11 ) = v128_bswap32( casti_v128u32( src,11 ) ); \
-   casti_v128u32( dst,12 ) = v128_bswap32( casti_v128u32( src,12 ) ); \
-   casti_v128u32( dst,13 ) = v128_bswap32( casti_v128u32( src,13 ) ); \
-   casti_v128u32( dst,14 ) = v128_bswap32( casti_v128u32( src,14 ) ); \
-   casti_v128u32( dst,15 ) = v128_bswap32( casti_v128u32( src,15 ) ); \
-}
-
 #define v128_block_bswap64( dst, src ) \
 { \
    casti_v128u64( dst,0 ) = v128_bswap64( casti_v128u64( src,0 ) ); \
@@ -513,27 +497,6 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
    casti_v128u64( dst,6 ) = v128_bswap64( casti_v128u64( src,6 ) ); \
    casti_v128u64( dst,7 ) = v128_bswap64( casti_v128u64( src,7 ) ); \
 }
-#define v128_block_bswap64_512    v128_block_bswap64 \
-
-#define v128_block_bswap64_1024( dst, src ) \
-{ \
-   casti_v128u64( dst, 0 ) = v128_bswap64( casti_v128u64( src, 0 ) ); \
-   casti_v128u64( dst, 1 ) = v128_bswap64( casti_v128u64( src, 1 ) ); \
-   casti_v128u64( dst, 2 ) = v128_bswap64( casti_v128u64( src, 2 ) ); \
-   casti_v128u64( dst, 3 ) = v128_bswap64( casti_v128u64( src, 3 ) ); \
-   casti_v128u64( dst, 4 ) = v128_bswap64( casti_v128u64( src, 4 ) ); \
-   casti_v128u64( dst, 5 ) = v128_bswap64( casti_v128u64( src, 5 ) ); \
-   casti_v128u64( dst, 6 ) = v128_bswap64( casti_v128u64( src, 6 ) ); \
-   casti_v128u64( dst, 7 ) = v128_bswap64( casti_v128u64( src, 7 ) ); \
-   casti_v128u64( dst, 8 ) = v128_bswap64( casti_v128u64( src, 8 ) ); \
-   casti_v128u64( dst, 9 ) = v128_bswap64( casti_v128u64( src, 9 ) ); \
-   casti_v128u64( dst,10 ) = v128_bswap64( casti_v128u64( src,10 ) ); \
-   casti_v128u64( dst,11 ) = v128_bswap64( casti_v128u64( src,11 ) ); \
-   casti_v128u64( dst,12 ) = v128_bswap64( casti_v128u64( src,12 ) ); \
-   casti_v128u64( dst,13 ) = v128_bswap64( casti_v128u64( src,13 ) ); \
-   casti_v128u64( dst,14 ) = v128_bswap64( casti_v128u64( src,14 ) ); \
-   casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
-}
 
 // Bitwise blend using vector mask, use only bytewise for compatibility
 // with x86_64.
diff --git a/sysinfos.c b/sysinfos.c
index af23258..dc1edf3 100644
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -173,21 +173,13 @@ static inline int cpu_fanpercent()
    return 0;
 }
 
+#if defined(__x86_64__)
 // x86_64 CPUID
 
 // This list is incomplete, it only contains features of interest to cpuminer.
 // refer to http://en.wikipedia.org/wiki/CPUID for details.
 
-// AVX10 compatibility notes
-//
-// Display format: AVX10.[version]-[vectorwidth]
-// AVX10.1-512 is a rebranding of AVX512 and is effectively the AVX* superset
-// with full 512 bit vector support.
-// AVX10.2-256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
-// features applied only to 256 bit and 128 bit vectors.
-// Future AVX10 versions will add new instructions and features.
-
 // Register array indexes
 #define EAX_Reg   (0)
 #define EBX_Reg   (1)
@@ -209,6 +201,7 @@ static inline int cpu_fanpercent()
 // CPU_INFO: EAX=1, ECX=0
 // ECX
 #define SSE3_Flag                    1
+#define PCLMULQDQ_Flag          (1<< 1)
 #define SSSE3_Flag              (1<< 9)
 #define XOP_Flag                (1<<11)   // obsolete
 #define FMA3_Flag               (1<<12)
@@ -239,6 +232,7 @@ static inline int cpu_fanpercent()
 #define AVX512_VBMI_Flag        (1<< 1)
 #define AVX512_VBMI2_Flag       (1<< 6)
 #define VAES_Flag               (1<< 9)
+#define VPCLMULQDQ_Flag         (1<<10)
 #define AVX512_VNNI_Flag        (1<<11)
 #define AVX512_BITALG_Flag      (1<<12)
 #define AVX512_VPOPCNTDQ_Flag   (1<<14)
@@ -260,6 +254,8 @@ static inline int cpu_fanpercent()
 #define AVX512_BF16_Flag        (1<< 5)
 #define AMX_FP16_Flag           (1<<21)
 #define AVX_IFMA_Flag           (1<<23)
+#define MOVRS_Flag              (1<<31)   // Both names are referenced in docs
+#define AVX10_MOVRS_Flag        (1<<31)
 // EDX
 #define AVX_VNNI_INT8_Flag      (1<< 4)
 #define AVX_NE_CONVERT_Flag     (1<< 5)
@@ -271,17 +267,15 @@ static inline int cpu_fanpercent()
 // AVX10_FEATURES: EAX=0x24, ECX=0
 // EBX
 #define AVX10_VERSION_mask        0xff   // bits [7:0]
-#define AVX10_128_Flag          (1<<16)
-#define AVX10_256_Flag          (1<<17)
-#define AVX10_512_Flag          (1<<18)
+//#define AVX10_128_Flag        (1<<16)
+//#define AVX10_256_Flag        (1<<17)
+//#define AVX10_512_Flag        (1<<18)
 
 // Use this to detect presence of feature
 #define AVX_mask     (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
 #define FMA3_mask    (FMA3_Flag|AVX_mask)
 #define AVX512_mask  (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)
 
-#if defined(__x86_64__)
-
 static inline void cpuid( unsigned int leaf, unsigned int subleaf,
                           unsigned int output[4] )
 {
@@ -317,7 +311,7 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
 #elif defined(ARM_AUXV)
 
 // Always test if HWCAP variable is defined in the kernel before attempting
-// to compile it. If not defined the feature can't be tested and won't be
+// to compile this. If not defined the feature can't be tested and won't be
 // included in the compile.
 // This can occur if compiling with an old kernel and a new CPU and could
 // result in a suboptimal build.
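The comment above documents the HWCAP guard used on the ARM_AUXV path: a feature is only probed at run time if the kernel headers used for the build define the corresponding HWCAP bit. A rough sketch of that pattern, assuming a Linux/AArch64 target; the function name cpu_has_sha3_hwcap is illustrative and not taken from the patch:

#include <stdbool.h>
#if defined(__aarch64__) && defined(__linux__)
#include <sys/auxv.h>    // getauxval, AT_HWCAP
#include <asm/hwcap.h>   // HWCAP_* bits from the kernel headers
#endif

static inline bool cpu_has_sha3_hwcap( void )
{
#if defined(__aarch64__) && defined(__linux__) && defined(HWCAP_SHA3)
   // HWCAP_SHA3 is defined, so the feature can be tested at run time.
   return ( getauxval( AT_HWCAP ) & HWCAP_SHA3 ) != 0;
#else
   // Old kernel headers: the feature can't be tested and is reported absent,
   // which is the suboptimal-build case the comment warns about.
   return false;
#endif
}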
@@ -543,6 +537,15 @@ static inline bool cpu_arch_aarch64()
 #endif
 }
 
+static inline bool cpu_arch_riscv64()
+{
+#if defined(__riscv) && ( __riscv_xlen == 64 )
+   return true;
+#else
+   return false;
+#endif
+}
+
 static inline bool has_sse()
 {
 #if defined(__x86_64__)
@@ -608,6 +611,16 @@ static inline bool has_neon()
 #endif
 }
 
+// No apparent CPUID equivalent on riscv, returns SW build info.
+static inline bool has_rvv()
+{
+#if defined(__riscv) && defined(__riscv_vector)
+   return true;
+#else
+   return false;
+#endif
+}
+
 static inline bool has_avx()
 {
 #if defined(__x86_64__)
@@ -897,7 +910,6 @@ static inline bool has_apx_f()
 #endif
 }
 
-// Not much use on it's own
 static inline bool has_avx10()
 {
 #if defined(__x86_64__)
@@ -922,49 +934,6 @@ static inline unsigned int avx10_version()
    return 0;
 }
 
-// also includes 256 & 128
-static inline bool has_avx10_512()
-{
-#if defined(__x86_64__)
-   if ( has_avx10() )
-   {
-      unsigned int cpu_info[4] = { 0 };
-      cpuid( AVX10_FEATURES, 0, cpu_info );
-      return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
-   }
-#endif
-   return false;
-}
-
-// Includes 128 but might not include 512
-static inline bool has_avx10_256()
-{
-#if defined(__x86_64__)
-   if ( has_avx10() )
-   {
-      unsigned int cpu_info[4] = { 0 };
-      cpuid( AVX10_FEATURES, 0, cpu_info );
-      return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
-   }
-#endif
-   return false;
-}
-
-// AVX10 vector register length
-static inline unsigned int avx10_vector_length()
-{
-#if defined(__x86_64__)
-   if ( has_avx10() )
-   {
-      unsigned int cpu_info[4] = { 0 };
-      cpuid( AVX10_FEATURES, 0, cpu_info );
-      return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
-           : ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
-   }
-#endif
-   return 0;
-}
-
 // ARM SVE vector register length, converted from bytes to bits.
 static inline int sve_vector_length()
 {
@@ -975,6 +944,33 @@ static inline int sve_vector_length()
    return 0;
 }
 
+// Assume min_vlen refers to the register size
+static inline int rvv_vector_length()
+{
+#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
+   return __riscv_v_min_vlen;
+#endif
+   return 0;
+}
+
+// generic
+static inline int vector_length()
+{
+#if defined(__x86_64__)
+   return has_avx10() || has_avx512() ? 512
+        : has_avx2()  ? 256
+        : has_sse2()  ? 128
+        : 0;
+#elif defined(__aarch64__)
+   return has_sve()  ? sve_vector_length()
+        : has_neon() ? 128
+        : 0;
+#elif defined(__riscv) && defined(__riscv_vector)
+   return rvv_vector_length();
+#endif
+   return 0;
+}
+
 static inline uint32_t cpuid_get_highest_function_number()
 {
 #if defined(__x86_64__)
@@ -1061,13 +1057,17 @@ static inline void cpu_brand_string( char* s )
    memcpy( s + 32, cpu_info, sizeof(cpu_info) );
 }
 
-#elif defined(__arm__) || defined(__aarch64__)
+#elif defined(__aarch64__)
 
    sprintf( s, "ARM 64 bit CPU" );
 
+#elif defined(__riscv) && (__riscv_xlen == 64)
+
+   sprintf( s, "RISC-V 64 bit CPU" );
+
 #else
 
-   sprintf( s, "unknown CPU architecture" );
+   sprintf( s, "unknown/unsupported CPU architecture" );
 
 #endif
 }
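The new generic vector_length() folds the per-architecture probes (AVX512/AVX2/SSE2, SVE/NEON, RVV) into a single width in bits. A hypothetical usage sketch, assuming it is placed where the sysinfos.c helpers are visible; the function name report_cpu_capabilities and the use of printf instead of the miner's own logging are illustrative only:

#include <stdio.h>

static void report_cpu_capabilities( void )
{
   char brand[0x40] = { 0 };
   cpu_brand_string( brand );     // brand string, or a generic arch label

   int vbits = vector_length();   // 0 when no vector unit is detected
   if ( vbits )
      printf( "%s, %d bit vectors\n", brand, vbits );
   else
      printf( "%s, no vector support\n", brand );
}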