diff --git a/Makefile.am b/Makefile.am index c84bddb..38219f5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -51,12 +51,15 @@ cpuminer_SOURCES = \ algo/blake/blake.c \ algo/blake/blake-4way.c \ algo/blake/sph_blake2b.c \ - algo/blake/blake2b.c \ algo/blake/sph-blake2s.c \ algo/blake/blake2s-hash-4way.c \ algo/blake/blake2s.c \ algo/blake/blake2s-gate.c \ algo/blake/blake2s-4way.c \ + algo/blake/blake2b-hash-4way.c \ + algo/blake/blake2b.c \ + algo/blake/blake2b-gate.c \ + algo/blake/blake2b-4way.c \ algo/blake/blakecoin-gate.c \ algo/blake/mod_blakecoin.c \ algo/blake/blakecoin.c \ @@ -169,7 +172,8 @@ cpuminer_SOURCES = \ algo/scryptjane/scrypt-jane.c \ algo/sha/sph_sha2.c \ algo/sha/sph_sha2big.c \ - algo/sha/sha2-hash-4way.c \ + algo/sha/sha256-hash-4way.c \ + algo/sha/sha512-hash-4way.c \ algo/sha/sha256_hash_11way.c \ algo/sha/sha2.c \ algo/sha/sha256t-gate.c \ diff --git a/README.md b/README.md index eba3b86..29e39b0 100644 --- a/README.md +++ b/README.md @@ -55,8 +55,9 @@ Supported Algorithms axiom Shabal-256 MemoHash bastion blake Blake-256 (SFR) - blakecoin blake256r8 + blake2b Blake2b 256 blake2s Blake-2 S + blakecoin blake256r8 bmw BMW 256 bmw512 BMW 512 c11 Chaincoin diff --git a/RELEASE_NOTES b/RELEASE_NOTES index e63debf..35aa7a7 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -38,6 +38,13 @@ supported. Change Log ---------- +v3.9.6.2 + +New algo blake2b. +Faster myr-gr on Ryzen using SHA. +Faster blake2s SSE2. +Small speedup of around 1% for several other algos. + v3.9.6.1 New algos: x21s, hex (alias x16r-hex). diff --git a/algo-gate-api.c b/algo-gate-api.c index e1ce066..72783e4 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -167,9 +167,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_AXIOM: register_axiom_algo ( gate ); break; case ALGO_BASTION: register_bastion_algo ( gate ); break; case ALGO_BLAKE: register_blake_algo ( gate ); break; - case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break; -// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break; + case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break; case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break; + case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break; case ALGO_BMW512: register_bmw512_algo ( gate ); break; case ALGO_C11: register_c11_algo ( gate ); break; case ALGO_CRYPTOLIGHT: register_cryptolight_algo ( gate ); break; diff --git a/algo/argon2/argon2d/argon2d/core.c b/algo/argon2/argon2d/argon2d/core.c index e222648..08c65d0 100644 --- a/algo/argon2/argon2d/argon2d/core.c +++ b/algo/argon2/argon2d/argon2d/core.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "core.h" #include "argon2d_thread.h" @@ -99,7 +100,8 @@ int allocate_memory(const argon2_context *context, uint8_t **memory, if (context->allocate_cbk) { (context->allocate_cbk)(memory, memory_size); } else { - *memory = malloc(memory_size); + *memory = _mm_malloc( memory_size, 64 ); +// *memory = malloc(memory_size); } if (*memory == NULL) { @@ -116,7 +118,8 @@ void free_memory(const argon2_context *context, uint8_t *memory, if (context->free_cbk) { (context->free_cbk)(memory, memory_size); } else { - free(memory); +// free(memory); + _mm_free( memory ); } } diff --git a/algo/argon2/argon2d/argon2d/opt.c b/algo/argon2/argon2d/argon2d/opt.c index 87ff4cc..ba0e87b 100644 --- a/algo/argon2/argon2d/argon2d/opt.c +++ b/algo/argon2/argon2d/argon2d/opt.c @@ -96,14 +96,14 @@ static void fill_block(__m256i *state, const block *ref_block, if (with_xor) { for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; 
i++) { state[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i)); + state[i], _mm256_load_si256((const __m256i *)ref_block->v + i)); block_XY[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i)); + state[i], _mm256_load_si256((const __m256i *)next_block->v + i)); } } else { for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { block_XY[i] = state[i] = _mm256_xor_si256( - state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i)); + state[i], _mm256_load_si256((const __m256i *)ref_block->v + i)); } } @@ -139,7 +139,7 @@ static void fill_block(__m256i *state, const block *ref_block, for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { state[i] = _mm256_xor_si256(state[i], block_XY[i]); - _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]); + _mm256_store_si256((__m256i *)next_block->v + i, state[i]); } } diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 2c8942e..f66c2cf 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -29,6 +29,8 @@ #include #endif +#include "simd-utils.h" + #if !defined(__AVX512F__) #if !defined(__AVX2__) #if !defined(__XOP__) @@ -182,64 +184,63 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { #include -#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)) -#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) -#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) -#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x))) +#define rotr32 mm256_swap32_64 +#define rotr24 mm256_ror3x8_64 +#define rotr16 mm256_ror1x16_64 +#define rotr63( x ) mm256_rol_64( x, 1 ) + +//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)) +//#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) +//#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) +//#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x))) #define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ do { \ - __m256i ml = _mm256_mul_epu32(A0, B0); \ - ml = _mm256_add_epi64(ml, ml); \ - A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ + __m256i ml0, ml1; \ + ml0 = _mm256_mul_epu32(A0, B0); \ + ml1 = _mm256_mul_epu32(A1, B1); \ + ml0 = _mm256_add_epi64(ml0, ml0); \ + ml1 = _mm256_add_epi64(ml1, ml1); \ + A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \ + A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \ D0 = _mm256_xor_si256(D0, A0); \ - D0 = rotr32(D0); \ - \ - ml = _mm256_mul_epu32(C0, D0); \ - ml = _mm256_add_epi64(ml, ml); \ - C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ - \ - B0 = _mm256_xor_si256(B0, C0); \ - B0 = rotr24(B0); \ - \ - ml = _mm256_mul_epu32(A1, B1); \ - ml = _mm256_add_epi64(ml, ml); \ - A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ D1 = _mm256_xor_si256(D1, A1); \ + D0 = rotr32(D0); \ D1 = rotr32(D1); \ - \ - ml = _mm256_mul_epu32(C1, D1); \ - ml = _mm256_add_epi64(ml, ml); \ - C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ 
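Two things are happening in the argon2d changes above. First, because core.c now allocates block memory with _mm_malloc( memory_size, 64 ), opt.c can switch from the unaligned _mm256_loadu_si256 / _mm256_storeu_si256 intrinsics to their aligned counterparts, which are cheaper on older microarchitectures and assert the 64-byte alignment contract. Second, the G1_AVX2 / G2_AVX2 rewrite replaces the single ml temporary with ml0 / ml1, so the two 256-bit halves of each round form independent dependency chains that can issue in parallel. The primitive each 64-bit lane computes is BlaMka; a minimal scalar sketch for reference (names are illustrative, not part of the patch):

#include <stdint.h>

/* BlaMka: x + y + 2 * (low32(x) * low32(y)).  _mm256_mul_epu32 multiplies
 * only the low 32 bits of each 64-bit lane, modelled here by the casts. */
static inline uint64_t fblamka_scalar( uint64_t x, uint64_t y )
{
    uint64_t m = (uint64_t)(uint32_t)x * (uint64_t)(uint32_t)y;
    return x + y + 2 * m;
}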
- \ + ml0 = _mm256_mul_epu32(C0, D0); \ + ml1 = _mm256_mul_epu32(C1, D1); \ + ml0 = _mm256_add_epi64(ml0, ml0); \ + ml1 = _mm256_add_epi64(ml1, ml1); \ + C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \ + C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \ + B0 = _mm256_xor_si256(B0, C0); \ B1 = _mm256_xor_si256(B1, C1); \ + B0 = rotr24(B0); \ B1 = rotr24(B1); \ } while((void)0, 0); #define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ do { \ - __m256i ml = _mm256_mul_epu32(A0, B0); \ - ml = _mm256_add_epi64(ml, ml); \ - A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ + __m256i ml0, ml1; \ + ml0 = _mm256_mul_epu32(A0, B0); \ + ml1 = _mm256_mul_epu32(A1, B1); \ + ml0 = _mm256_add_epi64(ml0, ml0); \ + ml1 = _mm256_add_epi64(ml1, ml1); \ + A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \ + A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \ D0 = _mm256_xor_si256(D0, A0); \ - D0 = rotr16(D0); \ - \ - ml = _mm256_mul_epu32(C0, D0); \ - ml = _mm256_add_epi64(ml, ml); \ - C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ - B0 = _mm256_xor_si256(B0, C0); \ - B0 = rotr63(B0); \ - \ - ml = _mm256_mul_epu32(A1, B1); \ - ml = _mm256_add_epi64(ml, ml); \ - A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ D1 = _mm256_xor_si256(D1, A1); \ + D0 = rotr16(D0); \ D1 = rotr16(D1); \ - \ - ml = _mm256_mul_epu32(C1, D1); \ - ml = _mm256_add_epi64(ml, ml); \ - C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ + ml0 = _mm256_mul_epu32(C0, D0); \ + ml1 = _mm256_mul_epu32(C1, D1); \ + ml0 = _mm256_add_epi64(ml0, ml0); \ + ml1 = _mm256_add_epi64(ml1, ml1); \ + C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \ + C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \ + B0 = _mm256_xor_si256(B0, C0); \ B1 = _mm256_xor_si256(B1, C1); \ + B0 = rotr63(B0); \ B1 = rotr63(B1); \ } while((void)0, 0); @@ -259,16 +260,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - \ tmp1 = C0; \ + B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ C0 = C1; \ - C1 = tmp1; \ - \ - tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \ - D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + C1 = tmp1; \ + tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ } while(0); #define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -287,16 +286,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - \ tmp1 = C0; \ + B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ C0 = C1; \ - C1 = tmp1; \ - \ - tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ - D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + C1 = tmp1; \ + tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ + D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ } while((void)0, 0); #define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ diff --git a/algo/blake/blake256-hash-4way.c 
b/algo/blake/blake256-hash-4way.c index b228e07..63cee58 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -308,12 +308,12 @@ static const sph_u32 CS[16] = { #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \ do { \ a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \ - _mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \ + _mm_set1_epi32( c1 ), m0 ), b ), a ); \ d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \ - _mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \ + _mm_set1_epi32( c0 ), m1 ), b ), a ); \ d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \ c = _mm_add_epi32( c, d ); \ b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ @@ -508,14 +508,18 @@ do { \ V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \ - V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \ - VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \ - VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \ - VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \ - VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \ - VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \ - VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \ + V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \ + V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \ + VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \ + VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \ + VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \ + m128_const1_64( 0xA4093822A4093822 ) ); \ + VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \ + m128_const1_64( 0x299F31D0299F31D0 ) ); \ + VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \ + m128_const1_64( 0x082EFA98082EFA98 ) ); \ + VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \ + m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \ BLAKE256_4WAY_BLOCK_BSWAP32; \ ROUND_S_4WAY(0); \ ROUND_S_4WAY(1); \ @@ -631,16 +635,20 @@ do { \ V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \ - V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \ - VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \ - VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \ - VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \ - VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \ - VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \ - VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \ - shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \ - 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ + V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \ + V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \ + VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \ + VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \ + VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\ + m256_const1_64( 0xA4093822A4093822 ) ); \ + VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\ + m256_const1_64( 0x299F31D0299F31D0 ) ); \ + VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \ + m256_const1_64( 0x082EFA98082EFA98 ) ); \ + VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \ + m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \ + shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \ + 
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \ M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \ M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \ @@ -696,14 +704,14 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, const uint32_t *salt, int rounds ) { __m128i zero = m128_zero; - casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] ); - casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] ); - casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] ); - casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] ); - casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] ); - casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] ); - casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] ); - casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] ); + casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 ); + casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 ); + casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 ); + casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A ); + casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F ); + casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C ); + casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB ); + casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 ); casti_m128i( ctx->S, 0 ) = zero; casti_m128i( ctx->S, 1 ) = zero; @@ -778,12 +786,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, else ctx->T0 -= 512 - bit_len; - buf[vptr] = _mm_set1_epi32( 0x80 ); + buf[vptr] = m128_const1_64( 0x0000008000000080 ); if ( vptr < 12 ) { memset_zero_128( buf + vptr + 1, 13 - vptr ); - buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) ); + buf[ 13 ] = _mm_or_si128( buf[ 13 ], + m128_const1_64( 0x0100000001000000ULL ) ); buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) ); buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) ); blake32_4way( ctx, buf + vptr, 64 - ptr ); @@ -795,7 +804,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, ctx->T0 = 0xFFFFFE00UL; ctx->T1 = 0xFFFFFFFFUL; memset_zero_128( buf, 56>>2 ); - buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) ); + buf[ 13 ] = _mm_or_si128( buf[ 13 ], + m128_const1_64( 0x0100000001000000ULL ) ); buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) ); buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) ); blake32_4way( ctx, buf, 64 ); @@ -815,20 +825,18 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv, const sph_u32 *salt, int rounds ) { __m256i zero = m256_zero; - casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] ); - casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] ); - casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] ); - casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] ); - casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] ); - casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] ); - casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] ); - casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] ); - + casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 ); + casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 ); + casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 ); + casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A ); + casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F ); + casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C ); + casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB ); + casti_m256i( sc->H, 7 ) = 
m256_const1_64( 0x5BE0CD195BE0CD19 ); casti_m256i( sc->S, 0 ) = zero; casti_m256i( sc->S, 1 ) = zero; casti_m256i( sc->S, 2 ) = zero; casti_m256i( sc->S, 3 ) = zero; - sc->T0 = sc->T1 = 0; sc->ptr = 0; sc->rounds = rounds; @@ -887,7 +895,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, ptr = sc->ptr; bit_len = ((unsigned)ptr << 3); - buf[ptr>>2] = _mm256_set1_epi32( 0x80 ); + buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL ); tl = sc->T0 + bit_len; th = sc->T1; @@ -909,7 +917,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 ); if ( out_size_w32 == 8 ) buf[52>>2] = _mm256_or_si256( buf[52>>2], - _mm256_set1_epi32( 0x01000000UL ) ); + m256_const1_64( 0x0100000001000000ULL ) ); *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); blake32_8way( sc, buf + (ptr>>2), 64 - ptr ); @@ -922,7 +930,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, sc->T1 = SPH_C32(0xFFFFFFFFUL); memset_zero_256( buf, 56>>2 ); if ( out_size_w32 == 8 ) - buf[52>>2] = _mm256_set1_epi32( 0x01000000UL ); + buf[52>>2] = m256_const1_64( 0x0100000001000000ULL ); *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); blake32_8way( sc, buf, 64 ); diff --git a/algo/blake/blake256-hash-4way.c.new b/algo/blake/blake256-hash-4way.c.new deleted file mode 100644 index 683c84c..0000000 --- a/algo/blake/blake256-hash-4way.c.new +++ /dev/null @@ -1,322 +0,0 @@ -// convert blake256 32 bit to use 64 bit with serial vectoring -// -// cut calls to GS in half -// -// combine V -// v0 = {V0,V1} -// v1 = {V2,V3} -// v2 = {V4,V5} -// v3 = {V6,V7} -// v4 = {V8,V9} -// v5 = {VA,VB} -// v6 = {VC,VD} -// v7 = {CE,VF} -// -// v6x = {VD,VC} swap(VC,VD) swap(v6) -// v7x = {VF,VE} swap(VE,VF) swap(v7) -// -// V0 = v1v0 -// V1 = v3v2 -// V2 = v5v4 -// V3 = v7v6 -// V4 = v9v8 -// V5 = vbva -// V6 = vdvc -// V7 = vfve -// -// The rotate in ROUND is to effect straddle and unstraddle for the third -// and 4th iteration of GS. -// It concatenates 2 contiguous 256 bit vectors and extracts the middle -// 256 bits. After the transform they must be restored with only the -// chosen bits modified in the original 2 vectors. -// ror1x128 achieves this by putting the chosen bits in arg1, the "low" -// 256 bit vector and saves the untouched bits temporailly in arg0, the -// "high" 256 bit vector. Simply reverse the process to restore data back -// to original positions. - -// Use standard 4way when AVX2 is not available use x2 mode with AVX2. -// -// Data is organised the same as 32 bit 4 way, in effect serial vectoring -// on top of parallel vectoring. Same data in the same place just taking -// two chunks at a time. -// -// Transparent to user, x2 mode used when AVX2 detected. -// Use existing 4way context but revert to scalar types. -// Same interleave function (128 bit) or x2 with 256 bit? -// User trsnaparency would have to apply to interleave as well. 
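A note on the m128_const1_64 / m256_const1_64 helpers this patch substitutes for _mm_set1_epi32 / _mm256_set1_epi32 throughout: they come from simd-utils.h, which is not shown here, so the sketch below is an assumption about their shape rather than the project's actual definition.

#include <immintrin.h>
#include <stdint.h>

/* Assumed behaviour of the simd-utils.h helper: broadcast one 64-bit
 * immediate to every 64-bit lane of the vector. */
static inline __m128i m128_bcast_64( uint64_t x )
{
    return _mm_set1_epi64x( (int64_t)x );
}

/* The 32-bit Blake-256 IV word 0x6A09E667 is written twice into the
 * 64-bit literal, so m128_bcast_64( 0x6A09E6676A09E667 ) yields the
 * same four 32-bit lanes as _mm_set1_epi32( 0x6A09E667 ). */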
-// -// Use common 4way update and close - -/* -typedef struct { - unsigned char buf[64<<2]; - uint32_t H[8<<2]; - uint32_t S[4<<2]; - size_t ptr; - uint32_t T0, T1; - int rounds; // 14 for blake, 8 for blakecoin & vanilla -} blakex2_4way_small_context __attribute__ ((aligned (64))); -*/ - -static void -blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, - const uint32_t *salt, int rounds ) -{ - casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] ); - casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] ); - casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] ); - casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] ); - casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] ); - casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] ); - casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] ); - casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] ); - - casti_m128i( ctx->S, 0 ) = m128_zero; - casti_m128i( ctx->S, 1 ) = m128_zero; - casti_m128i( ctx->S, 2 ) = m128_zero; - casti_m128i( ctx->S, 3 ) = m128_zero; -/* - sc->S[0] = _mm_set1_epi32( salt[0] ); - sc->S[1] = _mm_set1_epi32( salt[1] ); - sc->S[2] = _mm_set1_epi32( salt[2] ); - sc->S[3] = _mm_set1_epi32( salt[3] ); -*/ - ctx->T0 = ctx->T1 = 0; - ctx->ptr = 0; - ctx->rounds = rounds; -} - -static void -blake32x2( blake_4way_small_context *ctx, const void *data, size_t len ) -{ - __m128i *buf = (__m256i*)ctx->buf; - size_t bptr = ctx->ptr << 2; - size_t vptr = ctx->ptr >> 3; - size_t blen = len << 2; -// unsigned char *buf = ctx->buf; -// size_t ptr = ctx->ptr<<4; // repurposed - DECL_STATE32x2 - -// buf = sc->buf; -// ptr = sc->ptr; - -// adjust len for use with ptr, clen, all absolute bytes. -// int blen = len<<2; - - if ( blen < (sizeof ctx->buf) - bptr ) - { - memcpy( buf + vptr, data, blen ); - ptr += blen; - ctx->ptr = bptr >> 2;; - return; - } - - READ_STATE32( ctx ); - while ( blen > 0 ) - { - size_t clen; - - clen = ( sizeof sc->buf ) - ptr; - if ( clen > blen ) - clen = blen; - memcpy( buf + vptr, data, clen ); - bptr += clen; - vptr = bptr >> 5; - data = (const unsigned char *)data + clen; - blen -= clen; - if ( bptr == sizeof ctx->buf ) - { - if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover - T1 += 1; - COMPRESS32x2_4WAY( ctx->rounds ); - ptr = 0; - } - } - WRITE_STATE32x2( ctx ); - ctx->ptr = bptr >> 2; -} - -static void -blake32x2_4way_close( blake_4way_small_context *ctx, void *dst ) -{ - __m256i buf[8] __attribute__ ((aligned (64))); - size_t ptr = ctx->ptr; - size_t vptr = ctx->ptr>>2; - unsigned bit_len = ( (unsigned)ptr << 3 ); // one lane - uint32_t th = ctx->T1; - uint32_t tl = ctx->T0 + bit_len; - - if ( ptr == 0 ) - { - ctx->T0 = 0xFFFFFE00UL; - ctx->T1 = 0xFFFFFFFFUL; - } - else if ( ctx->T0 == 0 ) - { - ctx->T0 = 0xFFFFFE00UL + bit_len; - ctx->T1 -= 1; - } - else - ctx->T0 -= 512 - bit_len; - - // memset doesn't do ints - buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 ); - - if ( vptr < 5 ) - { - memset_zero_256( buf + vptr + 1, 6 - vptr ); - buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32( - 0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) ); - buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl, - th,th,th,th ) ); - blake32x2_4way( ctx, buf + vptr, 64 - ptr ); - } - else - { - memset_zero_256( vbuf + vptr + 1, 7 - vptr ); - blake32x2_4way( ctx, vbuf + ptr, 64 - ptr ); - ctx->T0 = 0xFFFFFE00UL; - ctx->T1 = 0xFFFFFFFFUL; - buf[ 6 ] = mm256_zero; - buf[ 6 ] = _mm256_set_epi32( 0,0,0,0, - 0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL ); - buf[ 7 ] = 
mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl, - th, th, th, th ); - blake32x2_4way( ctx, buf, 64 ); - } - - casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) ); - casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) ); - casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) ); - casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) ); -} - - - - -#define DECL_STATE32x2_4WAY \ - __m256i H0, H1, H2, H3; \ - __m256i S0, S1; \ - uint32_t T0, T1; - -#define READ_STATE32x2_4WAY(state) do \ -{ \ - H0 = casti_m256i( state->H, 0 ); \ - H1 = casti_m256i( state->H, 1 ); \ - H2 = casti_m256i( state->H, 2 ); \ - H3 = casti_m256i( state->H, 3 ); \ - S0 = casti_m256i( state->S, 0 ); \ - S1 = casti_m256i( state->S, 1 ); \ - T0 = state->T0; \ - T1 = state->T1; \ - -#define WRITE_STATE32x2_4WAY(state) do { \ - casti_m256i( state->H, 0 ) = H0; \ - casti_m256i( state->H, 1 ) = H1; \ - casti_m256i( state->H, 2 ) = H2; \ - casti_m256i( state->H, 3 ) = H3; \ - casti_m256i( state->S, 0 ) = S0; \ - casti_m256i( state->S, 1 ) = S1; \ - state->T0 = T0; \ - state->T1 = T1; \ -} while (0) - - -#define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \ -{ \ - a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ - _mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \ - _mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \ - d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \ - c = _mm256_add_epi32( c, d ); \ - b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \ - a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ - _mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \ - _mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \ - d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \ - c = _mm256_add_epi32( c, d ); \ - b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \ -} while (0) - -#define ROUND_Sx2_4WAY(r) do \ -{ \ - GS2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \ - CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \ - GS2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \ - CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \ - mm256_ror1x128_512( V3, V2 ); \ - mm256_ror1x128_512( V6, V7 ); \ - GS2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \ - CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \ - GS2_4WAY( Mx(r, C), Mx(r, D), Mx(r, C), Mx(r, D), \ - CSx(r, C), CSx(r, D), CSx(r, C), CSx(r, D), V1, V3, V4, V6 ); \ - mm256_rol1x128_512( V2, V3 ); \ - mm256_rol1x128_512( V7, V6 ); - -#define COMPRESS32x2_4WAY( rounds ) do \ -{ \ - __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ - __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ - unsigned r; \ - V0 = H0; \ - V1 = H1; \ - V2 = H2; \ - V3 = H3; \ - V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \ - CS0, CS0, CS0, CS0 ) ); \ - V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \ - CS2, CS2, CS2, CS2 ) ); \ - V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \ - _mm256_set_epi32( CS5, CS5, CS5, CS5, \ - CS4, CS4, CS4, CS4 ) ); \ - V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \ - _mm256_set_epi32( CS7, CS7, CS7, CS7, \ - CS6, CS6, CS6, CS6 ) ); \ - M0 = mm256_bswap_32( buf[ 0] ); \ - M1 = mm256_bswap_32( buf[ 1] ); \ - M2 = mm256_bswap_32( buf[ 2] ); \ - M3 = mm256_bswap_32( buf[ 3] ); \ - M4 = mm256_bswap_32( buf[ 4] ); \ - M5 = mm256_bswap_32( buf[ 5] ); \ - M6 = mm256_bswap_32( buf[ 6] ); \ - M7 = mm256_bswap_32( buf[ 7] ); \ - ROUND_Sx2_4WAY(0); \ - ROUND_Sx2_4WAY(1); \ - ROUND_Sx2_4WAY(2); \ - ROUND_Sx2_4WAY(3); \ - ROUND_Sx2_4WAY(4); \ - 
ROUND_Sx2_4WAY(5); \ - ROUND_Sx2_4WAY(6); \ - ROUND_Sx2_4WAY(7); \ - if (rounds == 14) \ - { \ - ROUND_Sx2_4WAY(8); \ - ROUND_Sx2_4WAY(9); \ - ROUND_Sx2_4WAY(0); \ - ROUND_Sx2_4WAY(1); \ - ROUND_Sx2_4WAY(2); \ - ROUND_Sx2_4WAY(3); \ - } \ - H0 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( V8, V0 ), S0 ), H0 ); \ - H1 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( V9, V1 ), S1 ), H1 ); \ - H2 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( VA, V2 ), S2 ), H2 ); \ - H3 = _mm256_xor_si256( _mm256_xor_si256( \ - _mm256_xor_si256( VB, V3 ), S3 ), H3 ); \ -} while (0) - - - - - - - - - - - - - - diff --git a/algo/blake/blake2b-4way.c b/algo/blake/blake2b-4way.c new file mode 100644 index 0000000..1bd3b2c --- /dev/null +++ b/algo/blake/blake2b-4way.c @@ -0,0 +1,67 @@ +/** + * Blake2-B Implementation + * tpruvot@github 2015-2016 + */ + +#include "blake2b-gate.h" + +#if defined(BLAKE2B_4WAY) + +#include +#include +#include "blake2b-hash-4way.h" + +// Function not used, code inlined. +void blake2b_4way_hash(void *output, const void *input) +{ + blake2b_4way_ctx ctx; + blake2b_4way_init( &ctx ); + blake2b_4way_update( &ctx, input, 80 ); + blake2b_4way_final( &ctx, output ); +} + +int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*4] __attribute__ ((aligned (64)));; + uint32_t vdata[20*4] __attribute__ ((aligned (32)));; + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + blake2b_4way_ctx ctx __attribute__ ((aligned (32))); + uint32_t *hash7 = &(hash[25]); // 3*8+1 + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + int thr_id = mythr->id; + __m256i *noncev = (__m256i*)vdata + 9; // aligned + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + + uint32_t n = first_nonce; + + mm256_bswap32_intrlv80_4x64( vdata, pdata ); + + do { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + blake2b_4way_init( &ctx ); + blake2b_4way_update( &ctx, vdata, 80 ); + blake2b_4way_final( &ctx, hash ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( hash7[ lane<<1 ] < Htarg ) + { + extr_lane_4x64( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 4; + } while ( (n < max_nonce-4) && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + return 0; +} + +#endif diff --git a/algo/blake/blake2b-gate.c b/algo/blake/blake2b-gate.c new file mode 100644 index 0000000..e875e04 --- /dev/null +++ b/algo/blake/blake2b-gate.c @@ -0,0 +1,25 @@ +#include "blake2b-gate.h" + +/* +// changed to get_max64_0x3fffffLL in cpuminer-multi-decred +int64_t blake2s_get_max64 () +{ + return 0x7ffffLL; +} +*/ + +bool register_blake2b_algo( algo_gate_t* gate ) +{ +#if defined(BLAKE2B_4WAY) + gate->scanhash = (void*)&scanhash_blake2b_4way; + gate->hash = (void*)&blake2b_4way_hash; +#else + gate->scanhash = (void*)&scanhash_blake2b; + gate->hash = (void*)&blake2b_hash; +#endif +// gate->get_max64 = (void*)&blake2s_get_max64; + gate->optimizations = AVX2_OPT; + return true; +}; + + diff --git a/algo/blake/blake2b-gate.h b/algo/blake/blake2b-gate.h new file mode 100644 index 0000000..4ba67f6 --- /dev/null +++ b/algo/blake/blake2b-gate.h @@ -0,0 +1,26 @@ +#ifndef __BLAKE2B_GATE_H__ +#define __BLAKE2B_GATE_H__ 1 + +#include +#include "algo-gate-api.h" + +#if 
defined(__AVX2__) + #define BLAKE2B_4WAY +#endif + +bool register_blake2b_algo( algo_gate_t* gate ); + +#if defined(BLAKE2B_4WAY) + +void blake2b_4way_hash( void *state, const void *input ); +int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +#else + +void blake2b_hash( void *state, const void *input ); +int scanhash_blake2b( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + +#endif diff --git a/algo/blake/blake2b-hash-4way.c b/algo/blake/blake2b-hash-4way.c new file mode 100644 index 0000000..93532d5 --- /dev/null +++ b/algo/blake/blake2b-hash-4way.c @@ -0,0 +1,215 @@ +/* + * Copyright 2009 Colin Percival, 2014 savale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include +#include +#include + +#include "blake2b-hash-4way.h" + +#if defined(__AVX2__) + +// G Mixing function. + +#define B2B_G(a, b, c, d, x, y) \ +{ \ + v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \ + v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \ + v[c] = _mm256_add_epi64( v[c], v[d] ); \ + v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \ + v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \ + v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \ + v[c] = _mm256_add_epi64( v[c], v[d] ); \ + v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \ +} + +// Initialization Vector. 
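The B2B_G macro defined above is the standard BLAKE2b quarter-round lifted onto four 64-bit lanes per __m256i. A scalar reference version, written against the RFC 7693 definition, is useful for checking the rotation constants (32, 24, 16, 63); it is a sketch for comparison, not code from this patch:

#include <stdint.h>

#define ROTR64( x, n ) ( ( (x) >> (n) ) | ( (x) << ( 64 - (n) ) ) )

/* Scalar BLAKE2b G function (RFC 7693); the 4-way macro above applies
 * exactly these steps to four independent states at once. */
static void b2b_g_scalar( uint64_t v[16], int a, int b, int c, int d,
                          uint64_t x, uint64_t y )
{
    v[a] = v[a] + v[b] + x;
    v[d] = ROTR64( v[d] ^ v[a], 32 );
    v[c] = v[c] + v[d];
    v[b] = ROTR64( v[b] ^ v[c], 24 );
    v[a] = v[a] + v[b] + y;
    v[d] = ROTR64( v[d] ^ v[a], 16 );
    v[c] = v[c] + v[d];
    v[b] = ROTR64( v[b] ^ v[c], 63 );
}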
+/* +static const uint64_t blake2b_iv[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 +}; +*/ + +static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last ) +{ + const uint8_t sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + }; + int i; + __m256i v[16], m[16]; + + v[ 0] = ctx->h[0]; + v[ 1] = ctx->h[1]; + v[ 2] = ctx->h[2]; + v[ 3] = ctx->h[3]; + v[ 4] = ctx->h[4]; + v[ 5] = ctx->h[5]; + v[ 6] = ctx->h[6]; + v[ 7] = ctx->h[7]; + v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 ); + v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B ); + v[10] = m256_const1_64( 0x3C6EF372FE94F82B ); + v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 ); + v[12] = m256_const1_64( 0x510E527FADE682D1 ); + v[13] = m256_const1_64( 0x9B05688C2B3E6C1F ); + v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B ); + v[15] = m256_const1_64( 0x5BE0CD19137E2179 ); + + v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) ); + v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) ); + + if ( last ) + v[14] = mm256_not( v[14] ); + + m[ 0] = ctx->b[ 0]; + m[ 1] = ctx->b[ 1]; + m[ 2] = ctx->b[ 2]; + m[ 3] = ctx->b[ 3]; + m[ 4] = ctx->b[ 4]; + m[ 5] = ctx->b[ 5]; + m[ 6] = ctx->b[ 6]; + m[ 7] = ctx->b[ 7]; + m[ 8] = ctx->b[ 8]; + m[ 9] = ctx->b[ 9]; + m[10] = ctx->b[10]; + m[11] = ctx->b[11]; + m[12] = ctx->b[12]; + m[13] = ctx->b[13]; + m[14] = ctx->b[14]; + m[15] = ctx->b[15]; + + for ( i = 0; i < 12; i++ ) + { + B2B_G( 0, 4, 8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] ); + B2B_G( 1, 5, 9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] ); + B2B_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] ); + B2B_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] ); + B2B_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] ); + B2B_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] ); + B2B_G( 2, 7, 8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] ); + B2B_G( 3, 4, 9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] ); + } + + ctx->h[0] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[0], v[0] ), v[ 8] ); + ctx->h[1] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[1], v[1] ), v[ 9] ); + ctx->h[2] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[2], v[2] ), v[10] ); + ctx->h[3] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[3], v[3] ), v[11] ); + ctx->h[4] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[4], v[4] ), v[12] ); + ctx->h[5] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[5], v[5] ), v[13] ); + ctx->h[6] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[6], v[6] ), v[14] ); + ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] ); +} + +int blake2b_4way_init( blake2b_4way_ctx *ctx ) +{ + size_t i; + + ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 ); + ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B ); + ctx->h[2] = m256_const1_64( 
0x3C6EF372FE94F82B ); + ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 ); + ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 ); + ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F ); + ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B ); + ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 ); + + ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) ); + + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->c = 0; + ctx->outlen = 32; + + for ( i = 0; i < 16; i++ ) + ctx->b[i] = m256_zero; + + return 0; +} + +void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input, + size_t inlen ) +{ + __m256i* in =(__m256i*)input; + + size_t i, c; + c = ctx->c >> 3; + + for ( i = 0; i < (inlen >> 3); i++ ) + { + if ( ctx->c == 128 ) + { + ctx->t[0] += ctx->c; + if ( ctx->t[0] < ctx->c ) + ctx->t[1]++; + blake2b_4way_compress( ctx, 0 ); + ctx->c = 0; + } + ctx->b[ c++ ] = in[i]; + ctx->c += 8; + } +} + +void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out ) +{ + size_t c; + c = ctx->c >> 3; + + ctx->t[0] += ctx->c; + if ( ctx->t[0] < ctx->c ) + ctx->t[1]++; + + while ( ctx->c < 128 ) + { + ctx->b[c++] = m256_zero; + ctx->c += 8; + } + + blake2b_4way_compress( ctx, 1 ); // final block flag = 1 + + casti_m256i( out, 0 ) = ctx->h[0]; + casti_m256i( out, 1 ) = ctx->h[1]; + casti_m256i( out, 2 ) = ctx->h[2]; + casti_m256i( out, 3 ) = ctx->h[3]; +} + +#endif diff --git a/algo/blake/blake2b-hash-4way.h b/algo/blake/blake2b-hash-4way.h new file mode 100644 index 0000000..30abd15 --- /dev/null +++ b/algo/blake/blake2b-hash-4way.h @@ -0,0 +1,35 @@ +#pragma once +#ifndef __BLAKE2B_HASH_4WAY_H__ +#define __BLAKE2B_HASH_4WAY_H__ + +#if defined(__AVX2__) + +#include "simd-utils.h" +#include +#include + +#if defined(_MSC_VER) +#include +#define inline __inline +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +// state context +ALIGN(64) typedef struct { + __m256i b[16]; // input buffer + __m256i h[8]; // chained state + uint64_t t[2]; // total number of bytes + size_t c; // pointer for b[] + size_t outlen; // digest size +} blake2b_4way_ctx __attribute__((aligned(64))); + +int blake2b_4way_init( blake2b_4way_ctx *ctx ); +void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input, + size_t inlen ); +void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out ); + +#endif + +#endif diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c index 6799116..07b06c2 100644 --- a/algo/blake/blake2b.c +++ b/algo/blake/blake2b.c @@ -3,13 +3,11 @@ * tpruvot@github 2015-2016 */ -#include "algo-gate-api.h" +#include "blake2b-gate.h" #include #include #include "algo/blake/sph_blake2b.h" -//static __thread sph_blake2b_ctx s_midstate; -//static __thread sph_blake2b_ctx s_ctx; #define MIDLEN 76 #define A 64 @@ -25,16 +23,6 @@ void blake2b_hash(void *output, const void *input) memcpy(output, hash, 32); } -/* -static void blake2b_hash_end(uint32_t *output, const uint32_t *input) -{ - s_ctx.outlen = MIDLEN; - memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN); - sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN); - sph_blake2b_final(&s_ctx, (uint8_t*) output); -} -*/ - int scanhash_blake2b( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -45,7 +33,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; - const uint32_t first_nonce = pdata[8]; + const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; @@ -53,179 
+41,23 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce, be32enc(&endiandata[i], pdata[i]); } - // midstate (untested yet) - //blake2b_init(&s_midstate, 32, NULL, 0); - //blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN); - //memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx)); - do { - be32enc(&endiandata[8], n); + be32enc(&endiandata[19], n); //blake2b_hash_end(vhashcpu, endiandata); blake2b_hash(vhashcpu, endiandata); if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) { work_set_target_ratio(work, vhashcpu); *hashes_done = n - first_nonce + 1; - pdata[8] = n; + pdata[19] = n; return 1; } n++; } while (n < max_nonce && !work_restart[thr_id].restart); *hashes_done = n - first_nonce + 1; - pdata[8] = n; + pdata[19] = n; return 0; } -static inline void swab256(void *dest_p, const void *src_p) -{ - uint32_t *dest = (uint32_t *)dest_p; - const uint32_t *src = (uint32_t *)src_p; - - dest[0] = swab32(src[7]); - dest[1] = swab32(src[6]); - dest[2] = swab32(src[5]); - dest[3] = swab32(src[4]); - dest[4] = swab32(src[3]); - dest[5] = swab32(src[2]); - dest[6] = swab32(src[1]); - dest[7] = swab32(src[0]); -} - -/* compute nbits to get the network diff */ -void blake2b_calc_network_diff(struct work *work) -{ - // sample for diff 43.281 : 1c05ea29 - uint32_t nbits = work->data[11]; // unsure if correct - uint32_t bits = (nbits & 0xffffff); - int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 - - double d = (double)0x0000ffff / (double)bits; - for (int m=shift; m < 29; m++) d *= 256.0; - for (int m=29; m < shift; m++) d /= 256.0; - if (opt_debug_diff) - applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); - net_diff = d; -} - -void blake2b_be_build_stratum_request( char *req, struct work *work ) -{ - unsigned char *xnonce2str; - uint32_t ntime, nonce; - char ntimestr[9], noncestr[9]; - be32enc( &ntime, work->data[ algo_gate.ntime_index ] ); - be32enc( &nonce, work->data[ algo_gate.nonce_index ] ); - bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); - bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); - uint16_t high_nonce = swab32(work->data[9]) >> 16; - xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2); - snprintf( req, JSON_BUF_LEN, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); - free( xnonce2str ); -} - -#define min(a,b) (a>b ? 
(b) :(a)) - -// merkle root handled here, no need for gen_merkle_root gate target -void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) -{ - uchar merkle_root[64] = { 0 }; - uint32_t extraheader[32] = { 0 }; - int headersize = 0; - size_t t; - int i; - - // merkle root - memcpy( merkle_root, sctx->job.coinbase, 32 ); - headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) ); - memcpy( extraheader, &sctx->job.coinbase[32], headersize ); - // Increment extranonce2 - for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - // Assemble block header - memset( g_work->data, 0, sizeof(g_work->data) ); -// g_work->data[0] = le32dec( sctx->job.version ); -// for ( i = 0; i < 8; i++ ) -// g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); - for ( i = 0; i < 8; i++ ) - g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i]; -// for ( i = 0; i < 8; i++ ) -// g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); - g_work->data[8] = 0; // nonce - g_work->data[9] = swab32( extraheader[0] ) | ( rand() & 0xf0 ); - g_work->data[10] = be32dec( sctx->job.ntime ); - g_work->data[11] = be32dec( sctx->job.nbits ); - for ( i = 0; i < 8; i++ ) - g_work->data[12+i] = ( (uint32_t*)merkle_root )[i]; -} - -#undef min - -void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id, - uint32_t* end_nonce_ptr, bool clean_job ) -{ - const int wkcmp_sz = 32; // bytes - const int wkcmp_off = 32 + 16; - uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); - - if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz ) - && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) - || strcmp( work->job_id, g_work->job_id ) ) ) - { - work_free( work ); - work_copy( work, g_work ); - *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id; - if ( opt_randomize ) - *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads; - *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; - } - else - ++(*nonceptr); - - // suprnova job_id check without data/target/height change... - // we just may have copied new g_wwork to work so why this test here? -// if ( have_stratum && strcmp( work->job_id, g_work->job_id ) ) - // exit thread loop -// continue; -// else -// { -// nonceptr[1] += 0x10; -// nonceptr[1] |= thr_id; -// } -} - -bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum, - int thr_id ) -{ - if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) ) - // need to regen g_work.. 
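The nonce-index fix in scanhash_blake2b above (pdata[8] becomes pdata[19]) goes together with deleting this SIA-specific gate code: the algo now mines a conventional 80-byte header in which the nonce is the last of twenty 32-bit words. A sketch of that layout (field names are illustrative, not from this codebase):

#include <stdint.h>

/* Conventional 80-byte block header viewed as 20 32-bit words.
 * The nonce sits in word 19, which is why scanhash_blake2b now
 * encodes the counter with be32enc( &endiandata[19], n ). */
typedef struct {
    uint32_t version;        /* word  0      */
    uint32_t prev_hash[8];   /* words 1..8   */
    uint32_t merkle_root[8]; /* words 9..16  */
    uint32_t ntime;          /* word 17      */
    uint32_t nbits;          /* word 18      */
    uint32_t nonce;          /* word 19      */
} std_block_header;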
- return false; - // extradata: prevent duplicates - work->data[ 8 ] += 0x10; - work->data[ 8 + 1 ] |= thr_id; - return true; -} - -double blake2b_get_max64() { return 0x1fffffLL; } - -bool register_blake2b_algo( algo_gate_t* gate ) -{ - algo_not_tested(); - gate->ntime_index = 10; - gate->nbits_index = 11; - gate->nonce_index = 8; - gate->work_cmp_size = 32; - gate->scanhash = (void*)&scanhash_blake2b; - gate->hash = (void*)&blake2b_hash; - gate->calc_network_diff = (void*)&blake2b_calc_network_diff; - gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request; - gate->work_decode = (void*)&std_be_work_decode; - gate->submit_getwork_result = (void*)&std_be_submit_getwork_result; - gate->build_extraheader = (void*)&blake2b_build_extraheader; - gate->get_new_work = (void*)&blake2b_get_new_work; - gate->get_max64 = (void*)&blake2b_get_max64; - gate->ready_to_mine = (void*)&blake2b_ready_to_mine; - have_gbt = false; - return true; -} diff --git a/algo/blake/blake2s-gate.c b/algo/blake/blake2s-gate.c index 2af35d5..68ace1a 100644 --- a/algo/blake/blake2s-gate.c +++ b/algo/blake/blake2s-gate.c @@ -20,7 +20,7 @@ bool register_blake2s_algo( algo_gate_t* gate ) gate->hash = (void*)&blake2s_hash; #endif gate->get_max64 = (void*)&blake2s_get_max64; - gate->optimizations = SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT; return true; }; diff --git a/algo/blake/blake2s-gate.h b/algo/blake/blake2s-gate.h index ee1a243..a9c8b8f 100644 --- a/algo/blake/blake2s-gate.h +++ b/algo/blake/blake2s-gate.h @@ -4,7 +4,8 @@ #include #include "algo-gate-api.h" -#if defined(__SSE4_2__) +//#if defined(__SSE4_2__) +#if defined(__SSE2__) #define BLAKE2S_4WAY #endif #if defined(__AVX2__) diff --git a/algo/blake/blake2s-hash-4way.c b/algo/blake/blake2s-hash-4way.c index f3bbe35..155518c 100644 --- a/algo/blake/blake2s-hash-4way.c +++ b/algo/blake/blake2s-hash-4way.c @@ -17,7 +17,9 @@ #include #include -#if defined(__SSE4_2__) +//#if defined(__SSE4_2__) +#if defined(__SSE2__) + static const uint32_t blake2s_IV[8] = { @@ -57,8 +59,18 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen ) memset( P->personal, 0, sizeof( P->personal ) ); memset( S, 0, sizeof( blake2s_4way_state ) ); - for( int i = 0; i < 8; ++i ) - S->h[i] = _mm_set1_epi32( blake2s_IV[i] ); + + S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL ); + S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL ); + S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL ); + S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL ); + S->h[4] = m128_const1_64( 0x510E527F510E527FULL ); + S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL ); + S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL ); + S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL ); + +// for( int i = 0; i < 8; ++i ) +// S->h[i] = _mm_set1_epi32( blake2s_IV[i] ); uint32_t *p = ( uint32_t * )( P ); @@ -267,8 +279,18 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen ) memset( P->personal, 0, sizeof( P->personal ) ); memset( S, 0, sizeof( blake2s_8way_state ) ); - for( int i = 0; i < 8; ++i ) - S->h[i] = _mm256_set1_epi32( blake2s_IV[i] ); + S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL ); + S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL ); + S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL ); + S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL ); + S->h[4] = m256_const1_64( 0x510E527F510E527FULL ); + S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL ); + S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL ); + S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL ); + + +// for( 
int i = 0; i < 8; ++i ) +// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] ); uint32_t *p = ( uint32_t * )( P ); diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h index 3457829..45d5de9 100644 --- a/algo/blake/blake2s-hash-4way.h +++ b/algo/blake/blake2s-hash-4way.h @@ -14,7 +14,8 @@ #ifndef __BLAKE2S_HASH_4WAY_H__ #define __BLAKE2S_HASH_4WAY_H__ 1 -#if defined(__SSE4_2__) +//#if defined(__SSE4_2__) +#if defined(__SSE2__) #include "simd-utils.h" diff --git a/algo/blake/blake512-hash-4way.c b/algo/blake/blake512-hash-4way.c index b57f712..a710dbf 100644 --- a/algo/blake/blake512-hash-4way.c +++ b/algo/blake/blake512-hash-4way.c @@ -307,12 +307,12 @@ static const sph_u64 CB[16] = { #define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \ a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ - _mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \ + _mm256_set1_epi64x( c1 ), m0 ), b ), a ); \ d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \ c = _mm256_add_epi64( c, d ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \ a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ - _mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \ + _mm256_set1_epi64x( c0 ), m1 ), b ), a ); \ d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \ c = _mm256_add_epi64( c, d ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \ @@ -479,20 +479,20 @@ static const sph_u64 CB[16] = { V5 = H5; \ V6 = H6; \ V7 = H7; \ - V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \ - V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \ - VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \ - VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \ + V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) ); \ + V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) ); \ + VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) ); \ + VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) ); \ VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ - _mm256_set1_epi64x( CB4 ) ); \ + m256_const1_64( CB4 ) ); \ VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \ - _mm256_set1_epi64x( CB5 ) ); \ + m256_const1_64( CB5 ) ); \ VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \ - _mm256_set1_epi64x( CB6 ) ); \ + m256_const1_64( CB6 ) ); \ VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \ - _mm256_set1_epi64x( CB7 ) ); \ - shuf_bswap64 = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607, \ - 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ + m256_const1_64( CB7 ) ); \ + shuf_bswap64 = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \ + 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \ @@ -544,14 +544,14 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv, const sph_u64 *salt ) { __m256i zero = m256_zero; - casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( iv[0] ); - casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( iv[1] ); - casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( iv[2] ); - casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( iv[3] ); - casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( iv[4] ); - casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( iv[5] ); - casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( iv[6] ); - casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( iv[7] ); + casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 ); + casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B ); + casti_m256i( sc->H, 2 ) = m256_const1_64( 
0x3C6EF372FE94F82B ); + casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 ); + casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 ); + casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F ); + casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B ); + casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 ); casti_m256i( sc->S, 0 ) = zero; casti_m256i( sc->S, 1 ) = zero; @@ -642,11 +642,9 @@ blake64_4way_close( blake_4way_big_context *sc, memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 ); if ( out_size_w64 == 8 ) buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)], - _mm256_set1_epi64x( 0x0100000000000000ULL ) ); - *(buf+(112>>3)) = mm256_bswap_64( - _mm256_set_epi64x( th, th, th, th ) ); - *(buf+(120>>3)) = mm256_bswap_64( - _mm256_set_epi64x( tl, tl, tl, tl ) ); + m256_const1_64( 0x0100000000000000ULL ) ); + *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) ); + *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) ); blake64_4way( sc, buf + (ptr>>3), 128 - ptr ); } @@ -659,11 +657,9 @@ blake64_4way_close( blake_4way_big_context *sc, sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL); memset_zero_256( buf, 112>>3 ); if ( out_size_w64 == 8 ) - buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL ); - *(buf+(112>>3)) = mm256_bswap_64( - _mm256_set_epi64x( th, th, th, th ) ); - *(buf+(120>>3)) = mm256_bswap_64( - _mm256_set_epi64x( tl, tl, tl, tl ) ); + buf[104>>3] = m256_const1_64( 0x0100000000000000ULL ); + *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) ); + *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) ); blake64_4way( sc, buf, 128 ); } diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index ca898dc..a17d7d7 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -103,7 +103,6 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last ) v[13] ^= ctx->t[1]; // high 64 bits if (last) // last block flag set ? 
v[14] = ~v[14]; - for (i = 0; i < 16; i++) // get little-endian words m[i] = B2B_GET64(&ctx->b[8 * i]); @@ -184,7 +183,8 @@ void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out ) while (ctx->c < 128) // fill up with zeros ctx->b[ctx->c++] = 0; - blake2b_compress(ctx, 1); // final block flag = 1 + + blake2b_compress(ctx, 1); // final block flag = 1 // little endian convert and store for (i = 0; i < ctx->outlen; i++) { diff --git a/algo/bmw/bmw-hash-4way.h b/algo/bmw/bmw-hash-4way.h index dcdb41d..bb23705 100644 --- a/algo/bmw/bmw-hash-4way.h +++ b/algo/bmw/bmw-hash-4way.h @@ -62,7 +62,7 @@ typedef struct { typedef bmw_4way_small_context bmw256_4way_context; -void bmw256_4way_init(void *cc); +void bmw256_4way_init( bmw256_4way_context *ctx ); void bmw256_4way(void *cc, const void *data, size_t len); diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c index 8f785e3..b5cda8f 100644 --- a/algo/bmw/bmw256-hash-4way.c +++ b/algo/bmw/bmw256-hash-4way.c @@ -48,7 +48,7 @@ extern "C"{ #if defined(__SSE2__) // BMW-256 4 way 32 - +/* static const uint32_t IV256[] = { 0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F, @@ -59,6 +59,7 @@ static const uint32_t IV256[] = { 0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F }; +*/ #define ss0(x) \ _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \ @@ -462,13 +463,30 @@ static const __m128i final_s[16] = { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf } }; */ -static void -bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv) +void bmw256_4way_init( bmw256_4way_context *ctx ) { - for ( int i = 0; i < 16; i++ ) - sc->H[i] = _mm_set1_epi32( iv[i] ); - sc->ptr = 0; - sc->bit_count = 0; + ctx->H[ 0] = m128_const1_64( 0x4041424340414243 ); + ctx->H[ 1] = m128_const1_64( 0x4445464744454647 ); + ctx->H[ 2] = m128_const1_64( 0x48494A4B48494A4B ); + ctx->H[ 3] = m128_const1_64( 0x4C4D4E4F4C4D4E4F ); + ctx->H[ 4] = m128_const1_64( 0x5051525350515253 ); + ctx->H[ 5] = m128_const1_64( 0x5455565754555657 ); + ctx->H[ 6] = m128_const1_64( 0x58595A5B58595A5B ); + ctx->H[ 7] = m128_const1_64( 0x5C5D5E5F5C5D5E5F ); + ctx->H[ 8] = m128_const1_64( 0x6061626360616263 ); + ctx->H[ 9] = m128_const1_64( 0x6465666764656667 ); + ctx->H[10] = m128_const1_64( 0x68696A6B68696A6B ); + ctx->H[11] = m128_const1_64( 0x6C6D6E6F6C6D6E6F ); + ctx->H[12] = m128_const1_64( 0x7071727370717273 ); + ctx->H[13] = m128_const1_64( 0x7475767774757677 ); + ctx->H[14] = m128_const1_64( 0x78797A7B78797A7B ); + ctx->H[15] = m128_const1_64( 0x7C7D7E7F7C7D7E7F ); + + +// for ( int i = 0; i < 16; i++ ) +// sc->H[i] = _mm_set1_epi32( iv[i] ); + ctx->ptr = 0; + ctx->bit_count = 0; } static void @@ -525,7 +543,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, buf = sc->buf; ptr = sc->ptr; - buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 ); + buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 ); ptr += 4; h = sc->H; @@ -551,11 +569,13 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, casti_m128i( dst, u ) = h1[v]; } +/* void bmw256_4way_init(void *cc) { bmw32_4way_init(cc, IV256); } +*/ void bmw256_4way(void *cc, const void *data, size_t len) @@ -1003,25 +1023,24 @@ static const __m256i final_s8[16] = void bmw256_8way_init( bmw256_8way_context *ctx ) { - ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] ); - ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] ); - ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] ); - ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] ); - ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] ); - ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] ); - ctx->H[ 6] = 
_mm256_set1_epi32( IV256[ 6] ); - ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] ); - ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] ); - ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] ); - ctx->H[10] = _mm256_set1_epi32( IV256[10] ); - ctx->H[11] = _mm256_set1_epi32( IV256[11] ); - ctx->H[12] = _mm256_set1_epi32( IV256[12] ); - ctx->H[13] = _mm256_set1_epi32( IV256[13] ); - ctx->H[14] = _mm256_set1_epi32( IV256[14] ); - ctx->H[15] = _mm256_set1_epi32( IV256[15] ); + ctx->H[ 0] = m256_const1_64( 0x4041424340414243 ); + ctx->H[ 1] = m256_const1_64( 0x4445464744454647 ); + ctx->H[ 2] = m256_const1_64( 0x48494A4B48494A4B ); + ctx->H[ 3] = m256_const1_64( 0x4C4D4E4F4C4D4E4F ); + ctx->H[ 4] = m256_const1_64( 0x5051525350515253 ); + ctx->H[ 5] = m256_const1_64( 0x5455565754555657 ); + ctx->H[ 6] = m256_const1_64( 0x58595A5B58595A5B ); + ctx->H[ 7] = m256_const1_64( 0x5C5D5E5F5C5D5E5F ); + ctx->H[ 8] = m256_const1_64( 0x6061626360616263 ); + ctx->H[ 9] = m256_const1_64( 0x6465666764656667 ); + ctx->H[10] = m256_const1_64( 0x68696A6B68696A6B ); + ctx->H[11] = m256_const1_64( 0x6C6D6E6F6C6D6E6F ); + ctx->H[12] = m256_const1_64( 0x7071727370717273 ); + ctx->H[13] = m256_const1_64( 0x7475767774757677 ); + ctx->H[14] = m256_const1_64( 0x78797A7B78797A7B ); + ctx->H[15] = m256_const1_64( 0x7C7D7E7F7C7D7E7F ); ctx->ptr = 0; ctx->bit_count = 0; - } void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len ) @@ -1074,7 +1093,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ) buf = ctx->buf; ptr = ctx->ptr; - buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 ); + buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 ); ptr += 4; h = ctx->H; @@ -1089,7 +1108,6 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst ) buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count ); buf[ (buf_size - 4) >> 2 ] = m256_zero; - compress_small_8way( buf, h, h2 ); for ( u = 0; u < 16; u ++ ) diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index 7c58003..e893c87 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -961,8 +961,22 @@ static const __m256i final_b[16] = static void bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv ) { - for ( int i = 0; i < 16; i++ ) - sc->H[i] = _mm256_set1_epi64x( iv[i] ); + sc->H[ 0] = m256_const1_64( 0x8081828384858687 ); + sc->H[ 1] = m256_const1_64( 0x88898A8B8C8D8E8F ); + sc->H[ 2] = m256_const1_64( 0x9091929394959697 ); + sc->H[ 3] = m256_const1_64( 0x98999A9B9C9D9E9F ); + sc->H[ 4] = m256_const1_64( 0xA0A1A2A3A4A5A6A7 ); + sc->H[ 5] = m256_const1_64( 0xA8A9AAABACADAEAF ); + sc->H[ 6] = m256_const1_64( 0xB0B1B2B3B4B5B6B7 ); + sc->H[ 7] = m256_const1_64( 0xB8B9BABBBCBDBEBF ); + sc->H[ 8] = m256_const1_64( 0xC0C1C2C3C4C5C6C7 ); + sc->H[ 9] = m256_const1_64( 0xC8C9CACBCCCDCECF ); + sc->H[10] = m256_const1_64( 0xD0D1D2D3D4D5D6D7 ); + sc->H[11] = m256_const1_64( 0xD8D9DADBDCDDDEDF ); + sc->H[12] = m256_const1_64( 0xE0E1E2E3E4E5E6E7 ); + sc->H[13] = m256_const1_64( 0xE8E9EAEBECEDEEEF ); + sc->H[14] = m256_const1_64( 0xF0F1F2F3F4F5F6F7 ); + sc->H[15] = m256_const1_64( 0xF8F9FAFBFCFDFEFF ); sc->ptr = 0; sc->bit_count = 0; } @@ -1014,13 +1028,11 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n, __m256i *buf; __m256i h1[16], h2[16], *h; size_t ptr, u, v; - unsigned z; const int buf_size = 128; // bytes of one lane, compatible with len buf = sc->buf; ptr = sc->ptr; - z = 0x80 >> n; - buf[ ptr>>3 ] = _mm256_set1_epi64x( z ); + buf[ ptr>>3 ] = m256_const1_64( 0x80 ); ptr += 8; h = sc->H; diff --git 
a/algo/groestl/myr-groestl.c b/algo/groestl/myr-groestl.c index d66260a..5a3b897 100644 --- a/algo/groestl/myr-groestl.c +++ b/algo/groestl/myr-groestl.c @@ -10,7 +10,7 @@ #else #include "aes_ni/hash-groestl.h" #endif -#include "algo/sha/sph_sha2.h" +#include <openssl/sha.h> typedef struct { #ifdef NO_AES_NI @@ -18,7 +18,7 @@ typedef struct { #else hashState_groestl groestl; #endif - sph_sha256_context sha; + SHA256_CTX sha; } myrgr_ctx_holder; myrgr_ctx_holder myrgr_ctx; @@ -28,15 +28,15 @@ void init_myrgr_ctx() { #ifdef NO_AES_NI sph_groestl512_init( &myrgr_ctx.groestl ); #else - init_groestl (&myrgr_ctx.groestl, 64 ); + init_groestl ( &myrgr_ctx.groestl, 64 ); #endif - sph_sha256_init(&myrgr_ctx.sha); + SHA256_Init( &myrgr_ctx.sha ); } void myriad_hash(void *output, const void *input) { - myrgr_ctx_holder ctx; - memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) ); + myrgr_ctx_holder ctx; + memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) ); uint32_t _ALIGN(32) hash[16]; @@ -44,23 +44,22 @@ void myriad_hash(void *output, const void *input) sph_groestl512(&ctx.groestl, input, 80); sph_groestl512_close(&ctx.groestl, hash); #else - update_groestl( &ctx.groestl, (char*)input, 640 ); - final_groestl( &ctx.groestl, (char*)hash); + update_groestl( &ctx.groestl, (char*)input, 640 ); + final_groestl( &ctx.groestl, (char*)hash); #endif - sph_sha256(&ctx.sha, hash, 64); - sph_sha256_close(&ctx.sha, hash); + SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 ); + SHA256_Final( (unsigned char*)hash, &ctx.sha ); memcpy(output, hash, 32); } -int scanhash_myriad( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr) +int scanhash_myriad( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; int thr_id = mythr->id; // thr_id arg is deprecated diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index b7ba0eb..17f0cf1 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -8,7 +8,7 @@ #include #include "aes_ni/hash-groestl.h" -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" typedef struct { hashState_groestl groestl; diff --git a/algo/groestl/myrgr-gate.h b/algo/groestl/myrgr-gate.h index 89fc5f1..706bdb7 100644 --- a/algo/groestl/myrgr-gate.h +++ b/algo/groestl/myrgr-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(__AVX2__) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__) #define MYRGR_4WAY #endif diff --git a/algo/jh/jh-hash-4way.c b/algo/jh/jh-hash-4way.c index 111e5f3..278b774 100644 --- a/algo/jh/jh-hash-4way.c +++ b/algo/jh/jh-hash-4way.c @@ -246,18 +246,12 @@ do { \ } while (0) */ -#define W0(x) Wz(x, _mm256_set_epi64x( 0x5555555555555555, \ - 0x5555555555555555, 0x5555555555555555, 0x5555555555555555 ), 1 ) -#define W1(x) Wz(x, _mm256_set_epi64x( 0x3333333333333333, \ - 0x3333333333333333, 0x3333333333333333, 0x3333333333333333 ), 2 ) -#define W2(x) Wz(x, _mm256_set_epi64x( 0x0F0F0F0F0F0F0F0F, \ - 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ), 4 ) -#define W3(x) Wz(x, _mm256_set_epi64x( 0x00FF00FF00FF00FF, \ - 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ), 8 ) -#define W4(x) Wz(x, _mm256_set_epi64x( 0x0000FFFF0000FFFF, \ - 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
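For reference, the OpenSSL call pattern the myr-groestl.c hunk above switches to uses SHA256_CTX with SHA256_Init/Update/Final from libcrypto (link with -lcrypto); on CPUs with SHA extensions libcrypto supplies a SHA-NI transform, which is why the gate header also stops defining MYRGR_4WAY when __SHA__ is set. A minimal sketch, copying a pre-initialized context per call the way myriad_hash() does:

#include <openssl/sha.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    unsigned char hash[32];
    unsigned char msg[64];
    memset( msg, 0xAB, sizeof msg );

    SHA256_CTX base, ctx;
    SHA256_Init( &base );               // done once, as in init_myrgr_ctx()
    memcpy( &ctx, &base, sizeof ctx );  // per-call copy, as in myriad_hash()
    SHA256_Update( &ctx, msg, sizeof msg );
    SHA256_Final( hash, &ctx );

    for ( int i = 0; i < 32; i++ ) printf( "%02x", hash[i] );
    printf( "\n" );
    return 0;
}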
0x0000FFFF0000FFFF ), 16 ) -#define W5(x) Wz(x, _mm256_set_epi64x( 0x00000000FFFFFFFF, \ - 0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF ), 32 ) +#define W0(x) Wz(x, m256_const1_64( 0x5555555555555555 ), 1 ) +#define W1(x) Wz(x, m256_const1_64( 0x3333333333333333 ), 2 ) +#define W2(x) Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 ) +#define W3(x) Wz(x, m256_const1_64( 0x00FF00FF00FF00FF ), 8 ) +#define W4(x) Wz(x, m256_const1_64( 0x0000FFFF0000FFFF ), 16 ) +#define W5(x) Wz(x, m256_const1_64( 0x00000000FFFFFFFF ), 32 ) #define W6(x) \ do { \ __m256i t = x ## h; \ @@ -331,14 +325,14 @@ do { \ __m256i m2l = buf[5]; \ __m256i m3h = buf[6]; \ __m256i m3l = buf[7]; \ - h0h = _mm256_xor_si256( h0h, m0h ); \ - h0l = _mm256_xor_si256( h0l, m0l ); \ - h1h = _mm256_xor_si256( h1h, m1h ); \ - h1l = _mm256_xor_si256( h1l, m1l ); \ - h2h = _mm256_xor_si256( h2h, m2h ); \ - h2l = _mm256_xor_si256( h2l, m2l ); \ - h3h = _mm256_xor_si256( h3h, m3h ); \ - h3l = _mm256_xor_si256( h3l, m3l ); \ + h0h = _mm256_xor_si256( h0h, m0h ); \ + h0l = _mm256_xor_si256( h0l, m0l ); \ + h1h = _mm256_xor_si256( h1h, m1h ); \ + h1l = _mm256_xor_si256( h1l, m1l ); \ + h2h = _mm256_xor_si256( h2h, m2h ); \ + h2l = _mm256_xor_si256( h2l, m2l ); \ + h3h = _mm256_xor_si256( h3h, m3h ); \ + h3l = _mm256_xor_si256( h3l, m3l ); \ #define INPUT_BUF2 \ h4h = _mm256_xor_si256( h4h, m0h ); \ @@ -477,13 +471,48 @@ static const sph_u64 IV512[] = { #endif -static void -jh_4way_init( jh_4way_context *sc, const void *iv ) +void jh256_4way_init( jh_4way_context *sc ) { - uint64_t *v = (uint64_t*)iv; - - for ( int i = 0; i < 16; i++ ) - sc->H[i] = _mm256_set_epi64x( v[i], v[i], v[i], v[i] ); + // bswapped IV256 + sc->H[ 0] = m256_const1_64( 0xebd3202c41a398eb ); + sc->H[ 1] = m256_const1_64( 0xc145b29c7bbecd92 ); + sc->H[ 2] = m256_const1_64( 0xfac7d4609151931c ); + sc->H[ 3] = m256_const1_64( 0x038a507ed6820026 ); + sc->H[ 4] = m256_const1_64( 0x45b92677269e23a4 ); + sc->H[ 5] = m256_const1_64( 0x77941ad4481afbe0 ); + sc->H[ 6] = m256_const1_64( 0x7a176b0226abb5cd ); + sc->H[ 7] = m256_const1_64( 0xa82fff0f4224f056 ); + sc->H[ 8] = m256_const1_64( 0x754d2e7f8996a371 ); + sc->H[ 9] = m256_const1_64( 0x62e27df70849141d ); + sc->H[10] = m256_const1_64( 0x948f2476f7957627 ); + sc->H[11] = m256_const1_64( 0x6c29804757b6d587 ); + sc->H[12] = m256_const1_64( 0x6c0d8eac2d275e5c ); + sc->H[13] = m256_const1_64( 0x0f7a0557c6508451 ); + sc->H[14] = m256_const1_64( 0xea12247067d3e47b ); + sc->H[15] = m256_const1_64( 0x69d71cd313abe389 ); + sc->ptr = 0; + sc->block_count = 0; +} + +void jh512_4way_init( jh_4way_context *sc ) +{ + // bswapped IV512 + sc->H[ 0] = m256_const1_64( 0x17aa003e964bd16f ); + sc->H[ 1] = m256_const1_64( 0x43d5157a052e6a63 ); + sc->H[ 2] = m256_const1_64( 0x0bef970c8d5e228a ); + sc->H[ 3] = m256_const1_64( 0x61c3b3f2591234e9 ); + sc->H[ 4] = m256_const1_64( 0x1e806f53c1a01d89 ); + sc->H[ 5] = m256_const1_64( 0x806d2bea6b05a92a ); + sc->H[ 6] = m256_const1_64( 0xa6ba7520dbcc8e58 ); + sc->H[ 7] = m256_const1_64( 0xf73bf8ba763a0fa9 ); + sc->H[ 8] = m256_const1_64( 0x694ae34105e66901 ); + sc->H[ 9] = m256_const1_64( 0x5ae66f2e8e8ab546 ); + sc->H[10] = m256_const1_64( 0x243c84c1d0a74710 ); + sc->H[11] = m256_const1_64( 0x99c15a2db1716e3b ); + sc->H[12] = m256_const1_64( 0x56f8b19decf657cf ); + sc->H[13] = m256_const1_64( 0x56b116577c8806a7 ); + sc->H[14] = m256_const1_64( 0xfb1785e6dffcc2e3 ); + sc->H[15] = m256_const1_64( 0x4bdd8ccc78465a54 ); sc->ptr = 0; sc->block_count = 0; } @@ -542,7 +571,7 @@ jh_4way_close( 
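The jh256/jh512 init rewrites above hard-code byte-swapped IV words so each can be broadcast directly with m256_const1_64 instead of being loaded from the sph IV table and swapped per lane at run time. A one-liner to recover the conventional IV word from a table entry (word 0 of the 512-bit IV shown), assuming GCC/Clang for __builtin_bswap64:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t h0 = 0x17aa003e964bd16fULL;   // sc->H[0] in the patch
    printf( "IV512[0] = 0x%016llx\n",
            (unsigned long long)__builtin_bswap64( h0 ) );
    return 0;
}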
jh_4way_context *sc, unsigned ub, unsigned n, void *dst, size_t numz, u; sph_u64 l0, l1, l0e, l1e; - buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 ); + buf[0] = m256_const1_64( 0x80ULL ); if ( sc->ptr == 0 ) numz = 48; @@ -555,8 +584,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst, l1 = SPH_T64(sc->block_count >> 55); sph_enc64be( &l0e, l0 ); sph_enc64be( &l1e, l1 ); - *(buf + (numz>>3) ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e ); - *(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e ); + *(buf + (numz>>3) ) = _mm256_set1_epi64x( l1e ); + *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e ); jh_4way_core( sc, buf, numz + 16 ); @@ -566,11 +595,13 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst, memcpy_256( dst256, buf, 8 ); } +/* void jh256_4way_init(void *cc) { - jh_4way_init(cc, IV256); + jhs_4way_init(cc, IV256); } +*/ void jh256_4way(void *cc, const void *data, size_t len) @@ -584,11 +615,13 @@ jh256_4way_close(void *cc, void *dst) jh_4way_close(cc, 0, 0, dst, 8, IV256); } +/* void jh512_4way_init(void *cc) { - jh_4way_init(cc, IV512); + jhb_4way_init(cc, IV512); } +*/ void jh512_4way(void *cc, const void *data, size_t len) diff --git a/algo/jh/jh-hash-4way.h b/algo/jh/jh-hash-4way.h index 14ad113..a634ecc 100644 --- a/algo/jh/jh-hash-4way.h +++ b/algo/jh/jh-hash-4way.h @@ -79,13 +79,13 @@ typedef jh_4way_context jh256_4way_context; typedef jh_4way_context jh512_4way_context; -void jh256_4way_init(void *cc); +void jh256_4way_init( jh_4way_context *sc); void jh256_4way(void *cc, const void *data, size_t len); void jh256_4way_close(void *cc, void *dst); -void jh512_4way_init(void *cc); +void jh512_4way_init( jh_4way_context *sc ); void jh512_4way(void *cc, const void *data, size_t len); diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index 5d2c87d..f7be5bb 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -39,10 +39,10 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, keccakhash_4way( hash, vdata ); for ( int lane = 0; lane < 4; lane++ ) - if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) ) + if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) { extr_lane_4x64( lane_hash, hash, lane, 256 ); - if ( fulltest( lane_hash, ptarget ) ) + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) { pdata[19] = n + lane; submit_lane_solution( work, lane_hash, mythr, lane ); diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index b8646d0..bb31081 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -370,18 +370,23 @@ static const sph_u64 RC[] = { static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size ) { - int i; - for (i = 0; i < 25; i ++) - kc->w[i] = _mm256_setzero_si256(); + __m256i zero = m256_zero; + __m256i neg1 = m256_neg1; // Initialization for the "lane complement". 
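The keccak-4way.c hunk above shows the usual two-stage candidate test: a cheap AND against the high bits of one 32-bit word of each lane filters nonces before the lane is de-interleaved and run through fulltest (whose submission path is now also skipped under opt_benchmark). A scalar sketch of the prefilter; the lane<<1 indexing reflects the 64-bit interleaved lane layout, and the sample values are illustrative only:

#include <stdint.h>
#include <stdio.h>

static int lane_passes_prefilter( const uint32_t *hash7, int lane )
{
    return ( hash7[ lane << 1 ] & 0xFFFFFF00 ) == 0;   // top 24 bits clear
}

int main(void)
{
    uint32_t hash7[8] = { 0x0000007F, 0, 0x12345678, 0,
                          0x00000000, 0, 0x000000FF, 0 };
    for ( int lane = 0; lane < 4; lane++ )
        printf( "lane %d: %s\n", lane,
                lane_passes_prefilter( hash7, lane ) ? "candidate" : "skip" );
    return 0;
}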
- kc->w[ 1] = m256_neg1; - kc->w[ 2] = m256_neg1; - kc->w[ 8] = m256_neg1; - kc->w[12] = m256_neg1; - kc->w[17] = m256_neg1; - kc->w[20] = m256_neg1; - kc->ptr = 0; + kc->w[ 0] = zero; kc->w[ 1] = neg1; + kc->w[ 2] = neg1; kc->w[ 3] = zero; + kc->w[ 4] = zero; kc->w[ 5] = zero; + kc->w[ 6] = zero; kc->w[ 7] = zero; + kc->w[ 8] = neg1; kc->w[ 9] = zero; + kc->w[10] = zero; kc->w[11] = zero; + kc->w[12] = neg1; kc->w[13] = zero; + kc->w[14] = zero; kc->w[15] = zero; + kc->w[16] = zero; kc->w[17] = neg1; + kc->w[18] = zero; kc->w[19] = zero; + kc->w[20] = neg1; kc->w[21] = zero; + kc->w[22] = zero; kc->w[23] = zero; + kc->w[24] = zero; kc->ptr = 0; kc->lim = 200 - (out_size >> 2); } @@ -441,8 +446,8 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len, eb = 0x100 >> 8; if ( kc->ptr == (lim - 8) ) { - uint64_t t = eb | 0x8000000000000000; - u.tmp[0] = _mm256_set_epi64x( t, t, t, t ); + const uint64_t t = eb | 0x8000000000000000; + u.tmp[0] = m256_const1_64( t ); j = 8; } else @@ -450,8 +455,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len, j = lim - kc->ptr; u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb ); memset_zero_256( u.tmp + 1, (j>>3) - 2 ); - u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000, - 0x8000000000000000, 0x8000000000000000, 0x8000000000000000); + u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 ); } keccak64_core( kc, u.tmp, j, lim ); /* Finalize the "lane complement" */ @@ -461,9 +465,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len, NOT64( kc->w[12], kc->w[12] ); NOT64( kc->w[17], kc->w[17] ); NOT64( kc->w[20], kc->w[20] ); - for ( j = 0; j < m256_len; j++ ) - u.tmp[j] = kc->w[j]; - memcpy_256( dst, u.tmp, m256_len ); + memcpy_256( dst, kc->w, m256_len ); } void keccak256_4way_init( void *kc ) diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index 5c65d4e..8db05dc 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -60,7 +60,7 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, int64_t step = 1; //Visitation step (used during Setup and Wandering phases) int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 - int64_t i; //auxiliary iteration counter +// int64_t i; //auxiliary iteration counter int64_t v64; // 64bit var for memcpy //====================================================================/ @@ -128,17 +128,22 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, //================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - initState( state ); +// initState( state ); //========================= Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits ptrWord = wholeMatrix; + + absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); +/* for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) } +*/ + //Initializes M[0] and M[1] reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here @@ -227,7 +232,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, 
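On the keccak64_init() rewrite just above: lanes 1, 2, 8, 12, 17 and 20 start out complemented because of the standard Keccak "lane complement" optimization, which moves NOTs out of the chi step of every round; keccak64_close() undoes it with the six NOT64 calls before copying the digest. The underlying identity, in scalar form:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t a = 0x1111222233334444ULL;
    uint64_t b = 0xF0F0F0F0F0F0F0F0ULL;
    uint64_t c = 0x0FF00FF00FF00FF0ULL;

    uint64_t direct  = a ^ ( ~b & c );   // textbook chi term
    uint64_t bc      = ~b;               // lane kept complemented in state
    uint64_t hoisted = a ^ ( bc & c );   // no NOT inside the round

    printf( "%s\n", direct == hoisted ? "equal" : "DIFFER" );
    return 0;
}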
void *K, uint64_t kLen, const void *pwd, int64_t step = 1; //Visitation step (used during Setup and Wandering phases) int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 - int64_t i; //auxiliary iteration counter +// int64_t i; //auxiliary iteration counter int64_t v64; // 64bit var for memcpy uint64_t instance = 0; //====================================================================/ @@ -302,17 +307,21 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, //================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - initState( state ); +// initState( state ); //========================= Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits ptrWord = wholeMatrix; + + absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); +/* for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) } +*/ //Initializes M[0] and M[1] reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here @@ -405,7 +414,7 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, int64_t step = 1; //Visitation step (used during Setup and Wandering phases) int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 - int64_t i; //auxiliary iteration counter +// int64_t i; //auxiliary iteration counter //=======================================================================/ //======= Initializing the Memory Matrix and pointers to it =============// @@ -459,17 +468,21 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, // if (state == NULL) { // return -1; // } - initState( state ); +// initState( state ); //============================== Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits - uint64_t *ptrWord = wholeMatrix; + uint64_t *ptrWord = wholeMatrix; + + absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, + BLOCK_LEN_BLAKE2_SAFE_INT64 ); +/* for ( i = 0; i < nBlocksInput; i++ ) { absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) } - +*/ //Initializes M[0] and M[1] reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols); @@ -623,17 +636,21 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen, //================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - initState( state ); +// initState( state ); //========================= Setup Phase =============================// //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 
512 bits ptrWord = wholeMatrix; + + absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN ); +/* for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil) ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) } +*/ //Initializes M[0] and M[1] reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 389aebf..989653a 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -86,7 +86,7 @@ void lyra2rev3_8way_hash( void *state, const void *input ) } -int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce, +int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*8] __attribute__ ((aligned (64))); @@ -94,12 +94,12 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce, uint32_t *hash7 = &(hash[7<<3]); uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; const uint32_t Htarg = ptarget[7]; __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + const int thr_id = mythr->id; // thr_id arg is deprecated if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; @@ -186,7 +186,7 @@ void lyra2rev3_4way_hash( void *state, const void *input ) bmw256_4way_close( &ctx.bmw, state ); } -int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce, +int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t hash[8*4] __attribute__ ((aligned (64))); @@ -194,12 +194,12 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce, uint32_t *hash7 = &(hash[7<<2]); uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; const uint32_t Htarg = ptarget[7]; __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + const int thr_id = mythr->id; // thr_id arg is deprecated if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c index f465960..5037caf 100644 --- a/algo/lyra2/sponge.c +++ b/algo/lyra2/sponge.c @@ -40,29 +40,32 @@ */ inline void initState( uint64_t State[/*16*/] ) { + + /* #if defined (__AVX2__) __m256i* state = (__m256i*)State; - - state[0] = _mm256_setzero_si256(); - state[1] = _mm256_setzero_si256(); - state[2] = _mm256_set_epi64x( blake2b_IV[3], blake2b_IV[2], - blake2b_IV[1], blake2b_IV[0] ); - state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6], - blake2b_IV[5], blake2b_IV[4] ); + const __m256i zero = m256_zero; + state[0] = zero; + state[1] = zero; + state[2] = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL, + 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL ); + state[3] = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL, + 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL ); #elif defined (__SSE2__) __m128i* state = (__m128i*)State; + const __m128i zero = m128_zero; - state[0] = _mm_setzero_si128(); - state[1] = _mm_setzero_si128(); - 
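All four LYRA2* call sites above make the same change: initState() plus the per-block absorb loop collapse into one absorbBlockBlake2Safe() call, so the sponge state can stay in SIMD registers across the whole input instead of being stored and reloaded for every block (the new body appears in the sponge.c hunk below). The shape of the refactor as a scalar sketch, where rounds() is a stub standing in for the 12-round Blake2b-based permutation and the 8-word rate is the 512-bit block the comments call hard-coded:

#include <stdint.h>

#define STATE_WORDS 16

static void rounds( uint64_t s[STATE_WORDS] ) { (void)s; /* permutation stub */ }

static void absorb_all( uint64_t *State, const uint64_t *In,
                        uint64_t nBlocks, uint64_t block_len )
{
    uint64_t s[STATE_WORDS] = {0};           // IV setup would go here
    for ( uint64_t i = 0; i < nBlocks; i++ )
    {
        for ( uint64_t w = 0; w < block_len; w++ )
            s[w] ^= In[w];                   // XOR the block into the rate
        rounds( s );                         // apply the permutation
        In += block_len;
    }
    for ( int w = 0; w < STATE_WORDS; w++ )  // single store at the end
        State[w] = s[w];
}

int main(void)
{
    uint64_t state[STATE_WORDS], in[16] = {0};
    absorb_all( state, in, 2, 8 );           // 2 blocks of 8 64-bit words
    return (int)state[0];
}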
state[2] = _mm_setzero_si128(); - state[3] = _mm_setzero_si128(); - state[4] = _mm_set_epi64x( blake2b_IV[1], blake2b_IV[0] ); - state[5] = _mm_set_epi64x( blake2b_IV[3], blake2b_IV[2] ); - state[6] = _mm_set_epi64x( blake2b_IV[5], blake2b_IV[4] ); - state[7] = _mm_set_epi64x( blake2b_IV[7], blake2b_IV[6] ); + state[0] = zero; + state[1] = zero; + state[2] = zero; + state[3] = zero; + state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL ); + state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL ); + state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL ); + state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL ); #else //First 512 bis are zeros @@ -77,6 +80,8 @@ inline void initState( uint64_t State[/*16*/] ) State[14] = blake2b_IV[6]; State[15] = blake2b_IV[7]; #endif +*/ + } /** @@ -250,43 +255,76 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In ) * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ -inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In ) +inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In, + const uint64_t nBlocks, const uint64_t block_len ) { - //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state +// XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with +// the IV. #if defined (__AVX2__) - register __m256i state0, state1, state2, state3; + register __m256i state0, state1, state2, state3; + const __m256i zero = m256_zero; + + state0 = zero; + state1 = zero; + state2 = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL, + 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL ); + state3 = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL, + 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL ); + + for ( int i = 0; i < nBlocks; i++ ) + { __m256i *in = (__m256i*)In; - - state0 = _mm256_load_si256( (__m256i*)State ); - state1 = _mm256_load_si256( (__m256i*)State + 1 ); - state2 = _mm256_load_si256( (__m256i*)State + 2 ); - state3 = _mm256_load_si256( (__m256i*)State + 3 ); - state0 = _mm256_xor_si256( state0, in[0] ); state1 = _mm256_xor_si256( state1, in[1] ); LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 ); + In += block_len; + } - _mm256_store_si256( (__m256i*)State, state0 ); - _mm256_store_si256( (__m256i*)State + 1, state1 ); - _mm256_store_si256( (__m256i*)State + 2, state2 ); - _mm256_store_si256( (__m256i*)State + 3, state3 ); + _mm256_store_si256( (__m256i*)State, state0 ); + _mm256_store_si256( (__m256i*)State + 1, state1 ); + _mm256_store_si256( (__m256i*)State + 2, state2 ); + _mm256_store_si256( (__m256i*)State + 3, state3 ); #elif defined (__SSE2__) - __m128i* state = (__m128i*)State; + __m128i state0, state1, state2, state3, state4, state5, state6, state7; + const __m128i zero = m128_zero; + + state0 = zero; + state1 = zero; + state2 = zero; + state3 = zero; + state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL ); + state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL ); + state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL ); + state7 = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL ); + + for ( int i = 0; i < nBlocks; i++ ) + { __m128i* in = (__m128i*)In; - state[0] = _mm_xor_si128( state[0], in[0] ); - state[1] = _mm_xor_si128( state[1], in[1] ); - state[2] = _mm_xor_si128( state[2], in[2] ); - state[3] = _mm_xor_si128( state[3], in[3] ); + state0 = 
_mm_xor_si128( state0, in[0] ); + state1 = _mm_xor_si128( state1, in[1] ); + state2 = _mm_xor_si128( state2, in[2] ); + state3 = _mm_xor_si128( state3, in[3] ); //Applies the transformation f to the sponge's state - LYRA_12_ROUNDS_AVX( state[0], state[1], state[2], state[3], - state[4], state[5], state[6], state[7] ); + LYRA_12_ROUNDS_AVX( state0, state1, state2, state3, + state4, state5, state6, state7 ); + In += block_len; + } + _mm_store_si128( (__m128i*)State, state0 ); + _mm_store_si128( (__m128i*)State + 1, state1 ); + _mm_store_si128( (__m128i*)State + 2, state2 ); + _mm_store_si128( (__m128i*)State + 3, state3 ); + _mm_store_si128( (__m128i*)State + 4, state4 ); + _mm_store_si128( (__m128i*)State + 5, state5 ); + _mm_store_si128( (__m128i*)State + 6, state6 ); + _mm_store_si128( (__m128i*)State + 7, state7 ); + #else State[0] ^= In[0]; diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 76be768..b423049 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -170,7 +170,8 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols); //---- Absorbs void absorbBlock(uint64_t *state, const uint64_t *in); -void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); +void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in, + const uint64_t nBlocks, const uint64_t block_len ); //---- Duplexes void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); diff --git a/algo/m7m.c b/algo/m7m.c index a45ec34..d55a317 100644 --- a/algo/m7m.c +++ b/algo/m7m.c @@ -19,100 +19,89 @@ #define EPS1 DBL_EPSILON #define EPS2 3.0e-11 -inline double exp_n(double xt) +inline double exp_n( double xt ) { - if(xt < -700.0) + if ( xt < -700.0 ) return 0; - else if(xt > 700.0) + else if ( xt > 700.0 ) return 1e200; - else if(xt > -0.8e-8 && xt < 0.8e-8) - return (1.0 + xt); + else if ( xt > -0.8e-8 && xt < 0.8e-8 ) + return ( 1.0 + xt ); else - return exp(xt); + return exp( xt ); } -inline double exp_n2(double x1, double x2) +inline double exp_n2( double x1, double x2 ) { - double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.; + double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, + p5 = 37., p6 = 700.; double xt = x1 - x2; - if (xt < p1+1.e-200) + if ( xt < p1+1.e-200 ) return 1.; - else if (xt > p1 && xt < p2 + 1.e-200) + else if ( xt > p1 && xt < p2 + 1.e-200 ) return ( 1. - exp(xt) ); - else if (xt > p2 && xt < p3 + 1.e-200) - return ( 1. / (1. + exp(xt)) ); - else if (xt > p3 && xt < p4) + else if ( xt > p2 && xt < p3 + 1.e-200 ) + return ( 1. / ( 1. + exp(xt) ) ); + else if ( xt > p3 && xt < p4 ) return ( 1. / (2. + xt) ); - else if (xt > p4 - 1.e-200 && xt < p5) - return ( exp(-xt) / (1. + exp(-xt)) ); - else if (xt > p5 - 1.e-200 && xt < p6) + else if ( xt > p4 - 1.e-200 && xt < p5 ) + return ( exp(-xt) / ( 1. + exp(-xt) ) ); + else if ( xt > p5 - 1.e-200 && xt < p6 ) return ( exp(-xt) ); - else if (xt > p6 - 1.e-200) + else if ( xt > p6 - 1.e-200 ) return 0.; } -double swit2_(double wvnmb) +double swit2_( double wvnmb ) { - return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5) - / 1034.66 * pow(sin(wvnmb/65.), 2.); + return pow( ( 5.55243 * ( exp_n( -0.3 * wvnmb / 15.762 ) + - exp_n( -0.6 * wvnmb / 15.762 ) ) ) * wvnmb, 0.5 ) + / 1034.66 * pow( sin( wvnmb / 65. ), 2. 
); } - -double GaussianQuad_N2(const double x1, const double x2) +double GaussianQuad_N2( const double x1, const double x2 ) { - double s=0.0; + double s = 0.0; double x[6], w[6]; //gauleg(a2, b2, x, w); double z1, z, xm, xl, pp, p3, p2, p1; - xm=0.5*(x2+x1); - xl=0.5*(x2-x1); - for(int i=1;i<=3;i++) + xm = 0.5 * ( x2 + x1 ); + xl = 0.5 * ( x2 - x1 ); + for( int i = 1; i <= 3; i++ ) { - z = (i == 1) ? 0.909632 : -0.0; - z = (i == 2) ? 0.540641 : z; - do + z = (i == 2) ? 0.540641 : ( (i == 1) ? 0.909632 : -0.0 ); + do { - p1 = z; - p2 = 1; - p3 = 0; - - p3=1; - p2=z; - p1=((3.0 * z * z) - 1) / 2; - - p3=p2; - p2=p1; - p1=((5.0 * z * p2) - (2.0 * z)) / 3; - - p3=p2; - p2=p1; - p1=((7.0 * z * p2) - (3.0 * p3)) / 4; - - p3=p2; - p2=p1; - p1=((9.0 * z * p2) - (4.0 * p3)) / 5; - - pp=5*(z*p1-p2)/(z*z-1.0); - z1=z; - z=z1-p1/pp; - } while (fabs(z-z1) > 3.0e-11); + p1 = ( ( 3.0 * z * z ) - 1 ) / 2; + p2 = p1; + p1 = ( ( 5.0 * z * p2 ) - ( 2.0 * z ) ) / 3; + p3 = p2; + p2 = p1; + p1 = ( ( 7.0 * z * p2 ) - ( 3.0 * p3 ) ) / 4; + p3 = p2; + p2 = p1; + p1 = ( ( 9.0 * z * p2 ) - ( 4.0 * p3 ) ) / 5; + pp = 5 * ( z * p1 - p2 ) / ( z * z - 1.0 ); + z1 = z; + z = z1 - p1 / pp; + } while ( fabs( z - z1 ) > 3.0e-11 ); - x[i]=xm-xl*z; - x[5+1-i]=xm+xl*z; - w[i]=2.0*xl/((1.0-z*z)*pp*pp); - w[5+1-i]=w[i]; + x[i] = xm - xl * z; + x[ 5+1-i ] = xm + xl * z; + w[i] = 2.0 * xl / ( ( 1.0 - z * z ) * pp * pp ); + w[ 5+1-i ] = w [i]; } - for(int j=1; j<=5; j++) s += w[j]*swit2_(x[j]); + for( int j = 1; j <= 5; j++ ) s += w[j] * swit2_( x[j] ); return s; } -uint32_t sw2_(int nnounce) +uint32_t sw2_( int nnounce ) { - double wmax = ((sqrt((double)(nnounce))*(1.+EPSa))/450+100); - return ((uint32_t)(GaussianQuad_N2(0., wmax)*(1.+EPSa)*1.e6)); + double wmax = ( ( sqrt( (double)(nnounce) ) * ( 1.+EPSa ) ) / 450+100 ); + return ( (uint32_t)( GaussianQuad_N2( 0., wmax ) * ( 1.+EPSa ) * 1.e6 ) ); } typedef struct { diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c index c5bdde3..2c5d561 100644 --- a/algo/quark/anime-4way.c +++ b/algo/quark/anime-4way.c @@ -49,7 +49,7 @@ void anime_4way_hash( void *state, const void *input ) __m256i* vhB = (__m256i*)vhashB; __m256i vh_mask; const uint32_t mask = 8; - const __m256i bit3_mask = _mm256_set1_epi64x( 8 ); + const __m256i bit3_mask = m256_const1_64( 8 ); const __m256i zero = _mm256_setzero_si256(); anime_4way_ctx_holder ctx; memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) ); diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 3645f19..101a5c2 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -21,7 +21,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" union _hmq1725_4way_context_overlay { @@ -57,7 +57,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) uint32_t vhashB[32<<2] __attribute__ ((aligned (64))); hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64))); __m256i vh_mask; - const __m256i vmask = _mm256_set1_epi64x( 24 ); + const __m256i vmask = m256_const1_64( 24 ); const uint32_t mask = 24; __m256i* vh = (__m256i*)vhash; __m256i* vhA = (__m256i*)vhashA; diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 9c0fb5d..ef0b7e5 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -49,7 +49,7 @@ void quark_4way_hash( void *state, const void *input ) __m256i* vhB = (__m256i*)vhashB; __m256i vh_mask; 
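The reworked GaussianQuad_N2() above is a hand-unrolled 5-point Gauss-Legendre setup: the do/while is a Newton iteration on the Legendre polynomial P5 built by the three-term recurrence, with the derivative taken from P5'(z) = 5(z P5 - P4)/(z^2 - 1) and weights w = 2/((1 - z^2) P5'(z)^2). The same computation with the recurrence left as a loop, for comparison (seeds match the ones in m7m.c; compile with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
    const double guess[3] = { 0.909632, 0.540641, 0.0 };
    for ( int i = 0; i < 3; i++ )
    {
        double z = guess[i], z1, pp, p0, p1;
        do {
            // recurrence: (k+1) P_{k+1} = (2k+1) z P_k - k P_{k-1}
            p0 = 1.0;  p1 = z;
            for ( int k = 1; k < 5; k++ )
            {
                double p2 = ( (2*k + 1) * z * p1 - k * p0 ) / (k + 1);
                p0 = p1;
                p1 = p2;
            }
            // derivative: P5'(z) = 5 (z P5 - P4) / (z^2 - 1)
            pp = 5.0 * ( z * p1 - p0 ) / ( z * z - 1.0 );
            z1 = z;
            z  = z1 - p1 / pp;                  // Newton step
        } while ( fabs( z - z1 ) > 3.0e-11 );   // same tolerance as m7m.c
        double w = 2.0 / ( ( 1.0 - z * z ) * pp * pp );
        printf( "node % .12f  weight %.12f\n", z, w );
    }
    return 0;
}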
quark_4way_ctx_holder ctx; - const __m256i bit3_mask = _mm256_set1_epi64x( 8 ); + const __m256i bit3_mask = m256_const1_64( 8 ); const uint32_t mask = 8; const __m256i zero = _mm256_setzero_si256(); diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c index 9ac5f53..fd758a9 100644 --- a/algo/ripemd/lbry-4way.c +++ b/algo/ripemd/lbry-4way.c @@ -3,7 +3,7 @@ #include #include #include -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" #include "ripemd-hash-4way.h" #define LBRY_INPUT_SIZE 112 diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c index 046e36d..c8f5cc7 100644 --- a/algo/ripemd/ripemd-hash-4way.c +++ b/algo/ripemd/ripemd-hash-4way.c @@ -5,23 +5,26 @@ #include #include +/* static const uint32_t IV[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 }; +*/ /* * Round constants for RIPEMD-160. */ -#define K11 0x00000000 -#define K12 0x5A827999 -#define K13 0x6ED9EBA1 -#define K14 0x8F1BBCDC -#define K15 0xA953FD4E -#define K21 0x50A28BE6 -#define K22 0x5C4DD124 -#define K23 0x6D703EF3 -#define K24 0x7A6D76E9 -#define K25 0x00000000 +#define K11 0x0000000000000000 +#define K12 0x5A8279995A827999 +#define K13 0x6ED9EBA16ED9EBA1 +#define K14 0x8F1BBCDC8F1BBCDC +#define K15 0xA953FD4EA953FD4E + +#define K21 0x50A28BE650A28BE6 +#define K22 0x5C4DD1245C4DD124 +#define K23 0x6D703EF36D703EF3 +#define K24 0x7A6D76E97A6D76E9 +#define K25 0x0000000000000000 // RIPEMD-160 4 way @@ -44,7 +47,7 @@ static const uint32_t IV[5] = do{ \ a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \ _mm_add_epi32( a, f( b ,c, d ) ), r ), \ - _mm_set1_epi32( k ) ), s ), e ); \ + m128_const1_64( k ) ), s ), e ); \ c = mm128_rol_32( c, 10 );\ } while (0) @@ -248,11 +251,11 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc ) void ripemd160_4way_init( ripemd160_4way_context *sc ) { - sc->val[0] = _mm_set1_epi32( IV[0] ); - sc->val[1] = _mm_set1_epi32( IV[1] ); - sc->val[2] = _mm_set1_epi32( IV[2] ); - sc->val[3] = _mm_set1_epi32( IV[3] ); - sc->val[4] = _mm_set1_epi32( IV[4] ); + sc->val[0] = m128_const1_64( 0x6745230167452301 ); + sc->val[1] = m128_const1_64( 0xEFCDAB89EFCDAB89 ); + sc->val[2] = m128_const1_64( 0x98BADCFE98BADCFE ); + sc->val[3] = m128_const1_64( 0x1032547610325476 ); + sc->val[4] = m128_const1_64( 0xC3D2E1F0C3D2E1F0 ); sc->count_high = sc->count_low = 0; } @@ -343,7 +346,7 @@ void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ) do{ \ a = _mm256_add_epi32( mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( \ _mm256_add_epi32( a, f( b ,c, d ) ), r ), \ - _mm256_set1_epi32( k ) ), s ), e ); \ + m256_const1_64( k ) ), s ), e ); \ c = mm256_rol_32( c, 10 );\ } while (0) @@ -548,11 +551,11 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc ) void ripemd160_8way_init( ripemd160_8way_context *sc ) { - sc->val[0] = _mm256_set1_epi32( IV[0] ); - sc->val[1] = _mm256_set1_epi32( IV[1] ); - sc->val[2] = _mm256_set1_epi32( IV[2] ); - sc->val[3] = _mm256_set1_epi32( IV[3] ); - sc->val[4] = _mm256_set1_epi32( IV[4] ); + sc->val[0] = m256_const1_64( 0x6745230167452301 ); + sc->val[1] = m256_const1_64( 0xEFCDAB89EFCDAB89 ); + sc->val[2] = m256_const1_64( 0x98BADCFE98BADCFE ); + sc->val[3] = m256_const1_64( 0x1032547610325476 ); + sc->val[4] = m256_const1_64( 0xC3D2E1F0C3D2E1F0 ); sc->count_high = sc->count_low = 0; } diff --git a/algo/sha/sha2-hash-4way.h b/algo/sha/sha-hash-4way.h similarity index 85% rename from algo/sha/sha2-hash-4way.h rename to algo/sha/sha-hash-4way.h index 
8ec16f3..5be93d4 100644 --- a/algo/sha/sha2-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -55,32 +55,13 @@ typedef struct { __m128i buf[64>>2]; __m128i val[8]; uint32_t count_high, count_low; + bool initialized; } sha256_4way_context; void sha256_4way_init( sha256_4way_context *sc ); void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); -/* -// SHA-256 7 way hybrid -// Combines SSE, MMX and scalar data to do 8 + 2 + 1 parallel. -typedef struct { - __m128i bufx[64>>2]; - __m128i valx[8]; - __m64 bufy[64>>2]; - __m64 valy[8]; - uint32_t bufz[64>>2]; - uint32_t valz[8]; - uint32_t count_high, count_low; -} sha256_7way_context; - -void sha256_7way_init( sha256_7way_context *ctx ); -void sha256_7way( sha256_7way_context *ctx, const void *datax, - void *datay, void *dataz, size_t len ); -void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx, - void *dstz ); -*/ - #if defined (__AVX2__) // SHA-256 8 way @@ -89,6 +70,7 @@ typedef struct { __m256i buf[64>>2]; __m256i val[8]; uint32_t count_high, count_low; + bool initialized; } sha256_8way_context; void sha256_8way_init( sha256_8way_context *sc ); @@ -103,6 +85,7 @@ typedef struct { __m256i buf[128>>3]; __m256i val[8]; uint64_t count; + bool initialized; } sha512_4way_context; void sha512_4way_init( sha512_4way_context *sc); diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 5ab3ee8..30feeaf 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -12,6 +12,7 @@ #include #include +#include <openssl/sha.h> #if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) #define EXTERN_SHA256 @@ -197,7 +198,17 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) extern void sha256d(unsigned char *hash, const unsigned char *data, int len) { - uint32_t S[16], T[16]; +#if defined(__SHA__) + SHA256_CTX ctx; + SHA256_Init( &ctx ); + SHA256_Update( &ctx, data, len ); + SHA256_Final( (unsigned char*)hash, &ctx ); + SHA256_Init( &ctx ); + SHA256_Update( &ctx, hash, 32 ); + SHA256_Final( (unsigned char*)hash, &ctx ); +#else + + uint32_t S[16], T[16]; int i, r; sha256_init(S); @@ -218,6 +229,7 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len) sha256_transform(T, S, 0); for (i = 0; i < 8; i++) be32enc((uint32_t *)hash + i, T[i]); +#endif } static inline void sha256d_preextend(uint32_t *W) @@ -635,9 +647,46 @@ int scanhash_sha256d( struct work *work, return 0; } +int scanhash_SHA256d( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t _ALIGN(128) hash[8]; + uint32_t _ALIGN(64) data[20]; + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int thr_id = mythr->id; // thr_id arg is deprecated + + memcpy( data, pdata, 80 ); + + do { + data[19] = ++n; + sha256d( (unsigned char*)hash, (const unsigned char*)data, 80 ); + if ( unlikely( swab32( hash[7] ) <= Htarg ) ) + { + pdata[19] = n; + sha256d_80_swap(hash, pdata); + if ( fulltest( hash, ptarget ) && !opt_benchmark ) + submit_solution( work, hash, mythr ); + } + } while ( likely( n < max_nonce && !work_restart[thr_id].restart ) ); + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + + bool register_sha256d_algo( algo_gate_t* gate ) { - gate->scanhash = (void*)&scanhash_sha256d; +#if defined(__SHA__) + gate->optimizations = SHA_OPT; + gate->scanhash = (void*)&scanhash_SHA256d;
+#else + gate->optimizations = SSE2_OPT | AVX2_OPT; + gate->scanhash = (void*)&scanhash_sha256d; +#endif gate->hash = (void*)&sha256d; return true; }; diff --git a/algo/sha/sha2-hash-4way.c b/algo/sha/sha256-hash-4way.c similarity index 63% rename from algo/sha/sha2-hash-4way.c rename to algo/sha/sha256-hash-4way.c index 9516543..ea1dc62 100644 --- a/algo/sha/sha2-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -34,19 +34,18 @@ #include #include - -#include "sha2-hash-4way.h" - -#include +#include "sha-hash-4way.h" // SHA-256 32 bit +/* static const sph_u32 H256[8] = { SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C), SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) }; +*/ static const sph_u32 K256[64] = { SPH_C32(0x428A2F98), SPH_C32(0x71374491), @@ -113,16 +112,17 @@ static const sph_u32 K256[64] = { #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ - register __m128i T1, T2; \ + __m128i T1, T2; \ + __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \ T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \ - _mm_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \ + K, W[i] ) ); \ T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \ D = _mm_add_epi32( D, T1 ); \ H = _mm_add_epi32( T1, T2 ); \ } while (0) static void -sha256_4way_round( __m128i *in, __m128i r[8] ) +sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] ) { register __m128i A, B, C, D, E, F, G, H; __m128i W[16]; @@ -130,14 +130,28 @@ sha256_4way_round( __m128i *in, __m128i r[8] ) mm128_block_bswap_32( W, in ); mm128_block_bswap_32( W+8, in+8 ); - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; + if ( ctx->initialized ) + { + A = r[0]; + B = r[1]; + C = r[2]; + D = r[3]; + E = r[4]; + F = r[5]; + G = r[6]; + H = r[7]; + } + else + { + A = m128_const1_64( 0x6A09E6676A09E667 ); + B = m128_const1_64( 0xBB67AE85BB67AE85 ); + C = m128_const1_64( 0x3C6EF3723C6EF372 ); + D = m128_const1_64( 0xA54FF53AA54FF53A ); + E = m128_const1_64( 0x510E527F510E527F ); + F = m128_const1_64( 0x9B05688C9B05688C ); + G = m128_const1_64( 0x1F83D9AB1F83D9AB ); + H = m128_const1_64( 0x5BE0CD195BE0CD19 ); + } SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -193,19 +207,36 @@ sha256_4way_round( __m128i *in, __m128i r[8] ) SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); } - r[0] = _mm_add_epi32( r[0], A ); - r[1] = _mm_add_epi32( r[1], B ); - r[2] = _mm_add_epi32( r[2], C ); - r[3] = _mm_add_epi32( r[3], D ); - r[4] = _mm_add_epi32( r[4], E ); - r[5] = _mm_add_epi32( r[5], F ); - r[6] = _mm_add_epi32( r[6], G ); - r[7] = _mm_add_epi32( r[7], H ); + if ( ctx->initialized ) + { + r[0] = _mm_add_epi32( r[0], A ); + r[1] = _mm_add_epi32( r[1], B ); + r[2] = _mm_add_epi32( r[2], C ); + r[3] = _mm_add_epi32( r[3], D ); + r[4] = _mm_add_epi32( r[4], E ); + r[5] = _mm_add_epi32( r[5], F ); + r[6] = _mm_add_epi32( r[6], G ); + r[7] = _mm_add_epi32( r[7], H ); + } + else + { + ctx->initialized = true; + r[0] = _mm_add_epi32( A, m128_const1_64( 0x6A09E6676A09E667 ) ); + r[1] = _mm_add_epi32( B, m128_const1_64( 0xBB67AE85BB67AE85 ) ); + r[2] = _mm_add_epi32( C, m128_const1_64( 0x3C6EF3723C6EF372 ) ); + r[3] = _mm_add_epi32( D, m128_const1_64( 0xA54FF53AA54FF53A ) ); + r[4] = _mm_add_epi32( E, m128_const1_64( 0x510E527F510E527F ) ); + r[5] = _mm_add_epi32( F, m128_const1_64( 0x9B05688C9B05688C ) ); + r[6] = _mm_add_epi32( G, m128_const1_64( 0x1F83D9AB1F83D9AB ) ); + r[7] = 
_mm_add_epi32( H, m128_const1_64( 0x5BE0CD195BE0CD19 ) ); + } } void sha256_4way_init( sha256_4way_context *sc ) { + sc->initialized = false; sc->count_high = sc->count_low = 0; +/* sc->val[0] = _mm_set1_epi32( H256[0] ); sc->val[1] = _mm_set1_epi32( H256[1] ); sc->val[2] = _mm_set1_epi32( H256[2] ); @@ -214,6 +245,7 @@ void sha256_4way_init( sha256_4way_context *sc ) sc->val[5] = _mm_set1_epi32( H256[5] ); sc->val[6] = _mm_set1_epi32( H256[6] ); sc->val[7] = _mm_set1_epi32( H256[7] ); +*/ } void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ) @@ -237,7 +269,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_4way_round( sc->buf, sc->val ); + sha256_4way_round( sc, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -256,13 +288,13 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) const int pad = buf_size - 8; ptr = (unsigned)sc->count_low & (buf_size - 1U); - sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 ); + sc->buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 ); ptr += 4; if ( ptr > pad ) { memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_4way_round( sc->buf, sc->val ); + sha256_4way_round( sc, sc->buf, sc->val ); memset_zero_128( sc->buf, pad >> 2 ); } else @@ -276,7 +308,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) mm128_bswap_32( _mm_set1_epi32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = mm128_bswap_32( _mm_set1_epi32( low ) ); - sha256_4way_round( sc->buf, sc->val ); + sha256_4way_round( sc, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); } @@ -313,16 +345,17 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) #define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ - register __m256i T1, T2; \ - T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ - _mm256_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \ + __m256i T1, T2; \ + __m256i K = _mm256_set1_epi32( K256[( (j)+(i) )] ); \ + T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \ + K, W[i] ) ); \ T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ D = _mm256_add_epi32( D, T1 ); \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) static void -sha256_8way_round( __m256i *in, __m256i r[8] ) +sha256_8way_round( sha256_8way_context *ctx, __m256i *in, __m256i r[8] ) { register __m256i A, B, C, D, E, F, G, H; __m256i W[16]; @@ -330,14 +363,28 @@ sha256_8way_round( __m256i *in, __m256i r[8] ) mm256_block_bswap_32( W , in ); mm256_block_bswap_32( W+8, in+8 ); - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; + if ( ctx->initialized ) + { + A = r[0]; + B = r[1]; + C = r[2]; + D = r[3]; + E = r[4]; + F = r[5]; + G = r[6]; + H = r[7]; + } + else + { + A = m256_const1_64( 0x6A09E6676A09E667 ); + B = m256_const1_64( 0xBB67AE85BB67AE85 ); + C = m256_const1_64( 0x3C6EF3723C6EF372 ); + D = m256_const1_64( 0xA54FF53AA54FF53A ); + E = m256_const1_64( 0x510E527F510E527F ); + F = m256_const1_64( 0x9B05688C9B05688C ); + G = m256_const1_64( 0x1F83D9AB1F83D9AB ); + H = m256_const1_64( 0x5BE0CD195BE0CD19 ); + } SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); @@ -393,20 +440,36 @@ sha256_8way_round( __m256i *in, __m256i r[8] ) SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); } - r[0] = _mm256_add_epi32( r[0], A ); - r[1] = _mm256_add_epi32( r[1], B ); - r[2] = _mm256_add_epi32( r[2], C ); - r[3] = _mm256_add_epi32( r[3], D ); - r[4] = _mm256_add_epi32( r[4], E 
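The initialized flag threaded through sha256_4way_round() above implements deferred IV injection: init() no longer broadcasts H256 into ctx->val, the first compress starts from IV constants materialized directly in registers, and the first feed-forward adds the IV constants instead of reading them back from memory. A one-word scalar sketch of why the two orders agree (compress_word is a hypothetical stand-in mixing function):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define H0 0x6A09E667u

typedef struct { uint32_t val; bool initialized; } ctx_t;

static uint32_t compress_word( uint32_t a ) { return a * 2654435761u + 1; }

static void round_fn( ctx_t *c )
{
    uint32_t a = c->initialized ? c->val : H0;  // IV lives in a register
    a = compress_word( a );
    if ( c->initialized )
        c->val += a;                            // normal feed-forward
    else
    {
        c->initialized = true;
        c->val = a + H0;                        // fold IV into feed-forward
    }
}

int main(void)
{
    ctx_t lazy  = { 0, false };                 // new deferred-init path
    ctx_t eager = { H0, true };                 // classic init-then-round
    round_fn( &lazy );
    round_fn( &eager );
    printf( "%s\n", lazy.val == eager.val ? "equal" : "DIFFER" );
    return 0;
}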
); - r[5] = _mm256_add_epi32( r[5], F ); - r[6] = _mm256_add_epi32( r[6], G ); - r[7] = _mm256_add_epi32( r[7], H ); + if ( ctx->initialized ) + { + r[0] = _mm256_add_epi32( r[0], A ); + r[1] = _mm256_add_epi32( r[1], B ); + r[2] = _mm256_add_epi32( r[2], C ); + r[3] = _mm256_add_epi32( r[3], D ); + r[4] = _mm256_add_epi32( r[4], E ); + r[5] = _mm256_add_epi32( r[5], F ); + r[6] = _mm256_add_epi32( r[6], G ); + r[7] = _mm256_add_epi32( r[7], H ); + } + else + { + ctx->initialized = true; + r[0] = _mm256_add_epi32( A, m256_const1_64( 0x6A09E6676A09E667 ) ); + r[1] = _mm256_add_epi32( B, m256_const1_64( 0xBB67AE85BB67AE85 ) ); + r[2] = _mm256_add_epi32( C, m256_const1_64( 0x3C6EF3723C6EF372 ) ); + r[3] = _mm256_add_epi32( D, m256_const1_64( 0xA54FF53AA54FF53A ) ); + r[4] = _mm256_add_epi32( E, m256_const1_64( 0x510E527F510E527F ) ); + r[5] = _mm256_add_epi32( F, m256_const1_64( 0x9B05688C9B05688C ) ); + r[6] = _mm256_add_epi32( G, m256_const1_64( 0x1F83D9AB1F83D9AB ) ); + r[7] = _mm256_add_epi32( H, m256_const1_64( 0x5BE0CD195BE0CD19 ) ); + } } - void sha256_8way_init( sha256_8way_context *sc ) { + sc->initialized = false; sc->count_high = sc->count_low = 0; +/* sc->val[0] = _mm256_set1_epi32( H256[0] ); sc->val[1] = _mm256_set1_epi32( H256[1] ); sc->val[2] = _mm256_set1_epi32( H256[2] ); @@ -415,6 +478,7 @@ void sha256_8way_init( sha256_8way_context *sc ) sc->val[5] = _mm256_set1_epi32( H256[5] ); sc->val[6] = _mm256_set1_epi32( H256[6] ); sc->val[7] = _mm256_set1_epi32( H256[7] ); +*/ } void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ) @@ -438,7 +502,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ) len -= clen; if ( ptr == buf_size ) { - sha256_8way_round( sc->buf, sc->val ); + sha256_8way_round( sc, sc->buf, sc->val ); ptr = 0; } clow = sc->count_low; @@ -457,13 +521,13 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) const int pad = buf_size - 8; ptr = (unsigned)sc->count_low & (buf_size - 1U); - sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 ); + sc->buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 ); ptr += 4; if ( ptr > pad ) { memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); - sha256_8way_round( sc->buf, sc->val ); + sha256_8way_round( sc, sc->buf, sc->val ); memset_zero_256( sc->buf, pad >> 2 ); } else @@ -478,207 +542,10 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) sc->buf[ ( pad+4 ) >> 2 ] = mm256_bswap_32( _mm256_set1_epi32( low ) ); - sha256_8way_round( sc->buf, sc->val ); + sha256_8way_round( sc, sc->buf, sc->val ); mm256_block_bswap_32( dst, sc->val ); } - -// SHA-512 4 way 64 bit - -static const sph_u64 H512[8] = { - SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) -}; - -static const sph_u64 K512[80] = { - SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), - SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), - SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), - SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), - SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), - SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), - SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), - SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), - SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), - SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), 
- SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), - SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), - SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), - SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), - SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), - SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), - SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), - SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), - SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), - SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), - SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), - SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), - SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), - SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), - SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), - SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), - SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), - SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), - SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), - SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), - SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), - SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), - SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), - SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), - SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), - SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), - SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), - SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), - SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), - SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) -}; - -#define CH(X, Y, Z) \ - _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) - -#define MAJ(X, Y, Z) \ - _mm256_or_si256( _mm256_and_si256( X, Y ), \ - _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) - -#define BSG5_0(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) ) - -#define BSG5_1(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) ) - -#define SSG5_0(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 7) ) - -#define SSG5_1(x) \ - _mm256_xor_si256( _mm256_xor_si256( \ - mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) ) - -#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ -do { \ - register __m256i T1, T2; \ - T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ - _mm256_set1_epi64x( K512[i] ), W[i] ) ); \ - T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ - D = _mm256_add_epi64( D, T1 ); \ - H = _mm256_add_epi64( T1, T2 ); \ -} while (0) - -static void -sha512_4way_round( __m256i *in, __m256i r[8] ) -{ - int i; - register __m256i A, B, C, D, E, F, G, H; - __m256i W[80]; - - mm256_block_bswap_64( W , in ); - mm256_block_bswap_64( W+8, in+8 ); - - for ( i = 16; i < 80; i++ ) - W[i] = mm256_add4_64( SSG5_1( W[ i- 2 ] ), W[ i- 7 ], - SSG5_0( W[ i-15 ] ), W[ i-16 ] ); - - A = r[0]; - B = r[1]; - C = r[2]; - D = r[3]; - E = r[4]; - F = r[5]; - G = r[6]; - H = r[7]; - - for ( i = 0; i < 80; i += 8 ) - { - SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); - SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 ); - SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 ); - SHA3_4WAY_STEP( F, G, H, A, B, 
C, D, E, i + 3 ); - SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 ); - SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 ); - SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 ); - SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 ); - } - - r[0] = _mm256_add_epi64( r[0], A ); - r[1] = _mm256_add_epi64( r[1], B ); - r[2] = _mm256_add_epi64( r[2], C ); - r[3] = _mm256_add_epi64( r[3], D ); - r[4] = _mm256_add_epi64( r[4], E ); - r[5] = _mm256_add_epi64( r[5], F ); - r[6] = _mm256_add_epi64( r[6], G ); - r[7] = _mm256_add_epi64( r[7], H ); -} - -void sha512_4way_init( sha512_4way_context *sc ) -{ - sc->count = 0; - sc->val[0] = _mm256_set1_epi64x( H512[0] ); - sc->val[1] = _mm256_set1_epi64x( H512[1] ); - sc->val[2] = _mm256_set1_epi64x( H512[2] ); - sc->val[3] = _mm256_set1_epi64x( H512[3] ); - sc->val[4] = _mm256_set1_epi64x( H512[4] ); - sc->val[5] = _mm256_set1_epi64x( H512[5] ); - sc->val[6] = _mm256_set1_epi64x( H512[6] ); - sc->val[7] = _mm256_set1_epi64x( H512[7] ); -} - -void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ) -{ - __m256i *vdata = (__m256i*)data; - size_t ptr; - const int buf_size = 128; - - ptr = (unsigned)sc->count & (buf_size - 1U); - while ( len > 0 ) - { - size_t clen; - clen = buf_size - ptr; - if ( clen > len ) - clen = len; - memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 ); - vdata = vdata + (clen>>3); - ptr += clen; - len -= clen; - if ( ptr == buf_size ) - { - sha512_4way_round( sc->buf, sc->val ); - ptr = 0; - } - sc->count += clen; - } -} - -void sha512_4way_close( sha512_4way_context *sc, void *dst ) -{ - unsigned ptr; - const int buf_size = 128; - const int pad = buf_size - 16; - - ptr = (unsigned)sc->count & (buf_size - 1U); - sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 ); - ptr += 8; - if ( ptr > pad ) - { - memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 ); - sha512_4way_round( sc->buf, sc->val ); - memset_zero_256( sc->buf, pad >> 3 ); - } - else - memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); - - sc->buf[ pad >> 3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); - sc->buf[ ( pad+8 ) >> 3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); - sha512_4way_round( sc->buf, sc->val ); - - mm256_block_bswap_64( dst, sc->val ); -} - #endif // __AVX2__ #endif // __SSE2__ diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index cc47b1c..41c3458 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -3,7 +3,7 @@ #include #include #include -#include "sha2-hash-4way.h" +#include "sha-hash-4way.h" #if defined(SHA256T_8WAY) diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index c17ea1d..7078d19 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -3,7 +3,7 @@ #include #include #include -#include "sha2-hash-4way.h" +#include "sha-hash-4way.h" #if defined(SHA256T_11WAY) @@ -158,7 +158,7 @@ void sha256t_8way_hash( void* output, const void* input ) sha256_8way_close( &ctx, output ); } -int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce, +int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t vdata[20*8] __attribute__ ((aligned (64))); @@ -166,12 +166,12 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce, uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash7 = &(hash[7<<3]); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + const uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t 
first_nonce = pdata[19]; uint32_t n = first_nonce; __m256i *noncev = (__m256i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + const int thr_id = mythr->id; const uint64_t htmax[] = { 0, 0xF, @@ -194,7 +194,7 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce, for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { - uint32_t mask = masks[m]; + const uint32_t mask = masks[m]; do { *noncev = mm256_bswap_32( _mm256_set_epi32( @@ -244,7 +244,7 @@ void sha256t_4way_hash( void* output, const void* input ) sha256_4way_close( &ctx, output ); } -int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce, +int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { uint32_t vdata[20*4] __attribute__ ((aligned (64))); @@ -252,12 +252,12 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce, uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[7<<2]); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + const uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; __m128i *noncev = (__m128i*)vdata + 19; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + const int thr_id = mythr->id; const uint64_t htmax[] = { 0, 0xF, @@ -278,7 +278,7 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce, for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { - uint32_t mask = masks[m]; + const uint32_t mask = masks[m]; do { *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); pdata[19] = n; diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c new file mode 100644 index 0000000..e333991 --- /dev/null +++ b/algo/sha/sha512-hash-4way.c @@ -0,0 +1,320 @@ +/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */ +/* + * SHA-384 / SHA-512 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#if defined(__AVX2__) + +#include +#include +#include "sha-hash-4way.h" + +// SHA-512 4 way 64 bit + +/* +static const sph_u64 H512[8] = { + SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), + SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), + SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), + SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +}; +*/ + +static const sph_u64 K512[80] = { + SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), + SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), + SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), + SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), + SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), + SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), + SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), + SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), + SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), + SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), + SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), + SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), + SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), + SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), + SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), + SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), + SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), + SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), + SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), + SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), + SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), + SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), + SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), + SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), + SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), + SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), + SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), + SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), + SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), + SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), + SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), + SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), + SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), + SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), + SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), + SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), + SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), + SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), + SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), + SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) +}; + +#define CH(X, Y, Z) \ + _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) + +#define MAJ(X, Y, Z) \ + _mm256_or_si256( _mm256_and_si256( X, Y ), \ + _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) + +#define BSG5_0(x) \ + _mm256_xor_si256( _mm256_xor_si256( \ + mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) ) + +#define BSG5_1(x) \ + _mm256_xor_si256( _mm256_xor_si256( \ + mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) ) + +#define SSG5_0(x) \ + _mm256_xor_si256( _mm256_xor_si256( \ + mm256_ror_64(x, 1), mm256_ror_64(x, 8) ), _mm256_srli_epi64(x, 
7) ) + +#define SSG5_1(x) \ + _mm256_xor_si256( _mm256_xor_si256( \ + mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) ) + +// Interleave SSG0 & SSG1 for better throughput. +// return ssg0(w0) + ssg1(w1) +static inline __m256i ssg512_add( __m256i w0, __m256i w1 ) +{ + __m256i w0a, w1a, w0b, w1b; + w0a = mm256_ror_64( w0, 1 ); + w1a = mm256_ror_64( w1,19 ); + w0b = mm256_ror_64( w0, 8 ); + w1b = mm256_ror_64( w1,61 ); + w0a = _mm256_xor_si256( w0a, w0b ); + w1a = _mm256_xor_si256( w1a, w1b ); + w0b = _mm256_srli_epi64( w0, 7 ); + w1b = _mm256_srli_epi64( w1, 6 ); + w0a = _mm256_xor_si256( w0a, w0b ); + w1a = _mm256_xor_si256( w1a, w1b ); + return _mm256_add_epi64( w0a, w1a ); +} + + +#define SSG512x2_0( w0, w1, i ) do \ +{ \ + __m256i X0a, X1a, X0b, X1b; \ + X0a = mm256_ror_64( W[i-15], 1 ); \ + X1a = mm256_ror_64( W[i-14], 1 ); \ + X0b = mm256_ror_64( W[i-15], 8 ); \ + X1b = mm256_ror_64( W[i-14], 8 ); \ + X0a = _mm256_xor_si256( X0a, X0b ); \ + X1a = _mm256_xor_si256( X1a, X1b ); \ + X0b = _mm256_srli_epi64( W[i-15], 7 ); \ + X1b = _mm256_srli_epi64( W[i-14], 7 ); \ + w0 = _mm256_xor_si256( X0a, X0b ); \ + w1 = _mm256_xor_si256( X1a, X1b ); \ +} while(0) + +#define SSG512x2_1( w0, w1, i ) do \ +{ \ + __m256i X0a, X1a, X0b, X1b; \ + X0a = mm256_ror_64( W[i-2],19 ); \ + X1a = mm256_ror_64( W[i-1],19 ); \ + X0b = mm256_ror_64( W[i-2],61 ); \ + X1b = mm256_ror_64( W[i-1],61 ); \ + X0a = _mm256_xor_si256( X0a, X0b ); \ + X1a = _mm256_xor_si256( X1a, X1b ); \ + X0b = _mm256_srli_epi64( W[i-2], 6 ); \ + X1b = _mm256_srli_epi64( W[i-1], 6 ); \ + w0 = _mm256_xor_si256( X0a, X0b ); \ + w1 = _mm256_xor_si256( X1a, X1b ); \ +} while(0) + +#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ +do { \ + __m256i T1, T2; \ + __m256i K = _mm256_set1_epi64x( K512[ i ] ); \ + T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \ + K, W[i] ) ); \ + T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ + D = _mm256_add_epi64( D, T1 ); \ + H = _mm256_add_epi64( T1, T2 ); \ +} while (0) + + +static void +sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] ) +{ + int i; + register __m256i A, B, C, D, E, F, G, H; + __m256i W[80]; + + mm256_block_bswap_64( W , in ); + mm256_block_bswap_64( W+8, in+8 ); + + for ( i = 16; i < 80; i++ ) + W[i] = _mm256_add_epi64( ssg512_add( W[i-15], W[i-2] ), + _mm256_add_epi64( W[ i- 7 ], W[ i-16 ] ) ); + + if ( ctx->initialized ) + { + A = r[0]; + B = r[1]; + C = r[2]; + D = r[3]; + E = r[4]; + F = r[5]; + G = r[6]; + H = r[7]; + } + else + { + A = m256_const1_64( 0x6A09E667F3BCC908 ); + B = m256_const1_64( 0xBB67AE8584CAA73B ); + C = m256_const1_64( 0x3C6EF372FE94F82B ); + D = m256_const1_64( 0xA54FF53A5F1D36F1 ); + E = m256_const1_64( 0x510E527FADE682D1 ); + F = m256_const1_64( 0x9B05688C2B3E6C1F ); + G = m256_const1_64( 0x1F83D9ABFB41BD6B ); + H = m256_const1_64( 0x5BE0CD19137E2179 ); + } + + for ( i = 0; i < 80; i += 8 ) + { + SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); + SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 ); + SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 ); + SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 ); + SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 ); + SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 ); + SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 ); + SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 ); + } + + if ( ctx->initialized ) + { + r[0] = _mm256_add_epi64( r[0], A ); + r[1] = _mm256_add_epi64( r[1], B ); + r[2] = _mm256_add_epi64( r[2], C ); + r[3] = _mm256_add_epi64( r[3], D ); + r[4] = 
_mm256_add_epi64( r[4], E ); + r[5] = _mm256_add_epi64( r[5], F ); + r[6] = _mm256_add_epi64( r[6], G ); + r[7] = _mm256_add_epi64( r[7], H ); + } + else + { + ctx->initialized = true; + r[0] = _mm256_add_epi64( A, m256_const1_64( 0x6A09E667F3BCC908 ) ); + r[1] = _mm256_add_epi64( B, m256_const1_64( 0xBB67AE8584CAA73B ) ); + r[2] = _mm256_add_epi64( C, m256_const1_64( 0x3C6EF372FE94F82B ) ); + r[3] = _mm256_add_epi64( D, m256_const1_64( 0xA54FF53A5F1D36F1 ) ); + r[4] = _mm256_add_epi64( E, m256_const1_64( 0x510E527FADE682D1 ) ); + r[5] = _mm256_add_epi64( F, m256_const1_64( 0x9B05688C2B3E6C1F ) ); + r[6] = _mm256_add_epi64( G, m256_const1_64( 0x1F83D9ABFB41BD6B ) ); + r[7] = _mm256_add_epi64( H, m256_const1_64( 0x5BE0CD19137E2179 ) ); + } +} + +void sha512_4way_init( sha512_4way_context *sc ) +{ + sc->initialized = false; + sc->count = 0; +/* + sc->val[0] = _mm256_set1_epi64x( H512[0] ); + sc->val[1] = _mm256_set1_epi64x( H512[1] ); + sc->val[2] = _mm256_set1_epi64x( H512[2] ); + sc->val[3] = _mm256_set1_epi64x( H512[3] ); + sc->val[4] = _mm256_set1_epi64x( H512[4] ); + sc->val[5] = _mm256_set1_epi64x( H512[5] ); + sc->val[6] = _mm256_set1_epi64x( H512[6] ); + sc->val[7] = _mm256_set1_epi64x( H512[7] ); +*/ +} + +void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ) +{ + __m256i *vdata = (__m256i*)data; + size_t ptr; + const int buf_size = 128; + + ptr = (unsigned)sc->count & (buf_size - 1U); + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 ); + vdata = vdata + (clen>>3); + ptr += clen; + len -= clen; + if ( ptr == buf_size ) + { + sha512_4way_round( sc, sc->buf, sc->val ); + ptr = 0; + } + sc->count += clen; + } +} + +void sha512_4way_close( sha512_4way_context *sc, void *dst ) +{ + unsigned ptr; + const int buf_size = 128; + const int pad = buf_size - 16; + + ptr = (unsigned)sc->count & (buf_size - 1U); + sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 ); + ptr += 8; + if ( ptr > pad ) + { + memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 ); + sha512_4way_round( sc, sc->buf, sc->val ); + memset_zero_256( sc->buf, pad >> 3 ); + } + else + memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); + + sc->buf[ pad >> 3 ] = + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + sc->buf[ ( pad+8 ) >> 3 ] = + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + sha512_4way_round( sc, sc->buf, sc->val ); + + mm256_block_bswap_64( dst, sc->val ); +} + +#endif // __AVX2__ diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 383e936..b84246b 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -63,7 +63,6 @@ extern "C"{ * that it can optimize them at will. */ -/* BEGIN -- automatically generated code. 
*/ #define DECL_STATE \ __m128i A00, A01, A02, A03, A04, A05, A06, A07, \ @@ -76,8 +75,11 @@ extern "C"{ M8, M9, MA, MB, MC, MD, ME, MF; \ sph_u32 Wlow, Whigh; -#define READ_STATE(state) do { \ - A00 = (state)->A[0]; \ +#define READ_STATE(state) do \ +{ \ + if ( (state)->state_loaded ) \ + { \ + A00 = (state)->A[0]; \ A01 = (state)->A[1]; \ A02 = (state)->A[2]; \ A03 = (state)->A[3]; \ @@ -121,9 +123,58 @@ extern "C"{ CD = (state)->C[13]; \ CE = (state)->C[14]; \ CF = (state)->C[15]; \ - Wlow = (state)->Wlow; \ - Whigh = (state)->Whigh; \ - } while (0) + } \ + else \ + { \ + (state)->state_loaded = true; \ + A00 = m128_const1_64( 0x20728DFD20728DFD ); \ + A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \ + A02 = m128_const1_64( 0xE782B699E782B699 ); \ + A03 = m128_const1_64( 0x5530463255304632 ); \ + A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \ + A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \ + A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \ + A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \ + A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \ + A09 = m128_const1_64( 0x8BD144108BD14410 ); \ + A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \ + A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \ + B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \ + B1 = m128_const1_64( 0x07B385F307B385F3 ); \ + B2 = m128_const1_64( 0xE7442C26E7442C26 ); \ + B3 = m128_const1_64( 0xCC8AD640CC8AD640 ); \ + B4 = m128_const1_64( 0xEB6F56C7EB6F56C7 ); \ + B5 = m128_const1_64( 0x1EA81AA91EA81AA9 ); \ + B6 = m128_const1_64( 0x73B9D31473B9D314 ); \ + B7 = m128_const1_64( 0x1DE85D081DE85D08 ); \ + B8 = m128_const1_64( 0x48910A5A48910A5A ); \ + B9 = m128_const1_64( 0x893B22DB893B22DB ); \ + BA = m128_const1_64( 0xC5A0DF44C5A0DF44 ); \ + BB = m128_const1_64( 0xBBC4324EBBC4324E ); \ + BC = m128_const1_64( 0x72D2F24072D2F240 ); \ + BD = m128_const1_64( 0x75941D9975941D99 ); \ + BE = m128_const1_64( 0x6D8BDE826D8BDE82 ); \ + BF = m128_const1_64( 0xA1A7502BA1A7502B ); \ + C0 = m128_const1_64( 0xD9BF68D1D9BF68D1 ); \ + C1 = m128_const1_64( 0x58BAD75058BAD750 ); \ + C2 = m128_const1_64( 0x56028CB256028CB2 ); \ + C3 = m128_const1_64( 0x8134F3598134F359 ); \ + C4 = m128_const1_64( 0xB5D469D8B5D469D8 ); \ + C5 = m128_const1_64( 0x941A8CC2941A8CC2 ); \ + C6 = m128_const1_64( 0x418B2A6E418B2A6E ); \ + C7 = m128_const1_64( 0x0405278004052780 ); \ + C8 = m128_const1_64( 0x7F07D7877F07D787 ); \ + C9 = m128_const1_64( 0x5194358F5194358F ); \ + CA = m128_const1_64( 0x3C60D6653C60D665 ); \ + CB = m128_const1_64( 0xBE97D79ABE97D79A ); \ + CC = m128_const1_64( 0x950C3434950C3434 ); \ + CD = m128_const1_64( 0xAED9A06DAED9A06D ); \ + CE = m128_const1_64( 0x2537DC8D2537DC8D ); \ + CF = m128_const1_64( 0x7CDB59697CDB5969 ); \ + } \ + Wlow = (state)->Wlow; \ + Whigh = (state)->Whigh; \ +} while (0) #define WRITE_STATE(state) do { \ (state)->A[0] = A00; \ @@ -397,6 +448,7 @@ do { \ Whigh = T32(Whigh + 1); \ } while (0) +/* static const sph_u32 A_init_256[] = { C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), @@ -436,33 +488,115 @@ static const sph_u32 C_init_512[] = { C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) }; +*/ static void shabal_4way_init( void *cc, unsigned size ) { shabal_4way_context *sc = (shabal_4way_context*)cc; - int i; if ( size == 512 ) - { - for ( i = 0; i < 12; i++ ) - sc->A[i] = _mm_set1_epi32( A_init_512[i] ); - for ( i = 0; i < 16; i++ ) - { - sc->B[i] = _mm_set1_epi32( 
B_init_512[i] ); - sc->C[i] = _mm_set1_epi32( C_init_512[i] ); - } + { // copy immediate constants directly to working registers later. + sc->state_loaded = false; +/* + sc->A[ 0] = m128_const1_64( 0x20728DFD20728DFD ); + sc->A[ 1] = m128_const1_64( 0x46C0BD5346C0BD53 ); + sc->A[ 2] = m128_const1_64( 0xE782B699E782B699 ); + sc->A[ 3] = m128_const1_64( 0x5530463255304632 ); + sc->A[ 4] = m128_const1_64( 0x71B4EF9071B4EF90 ); + sc->A[ 5] = m128_const1_64( 0x0EA9E82C0EA9E82C ); + sc->A[ 6] = m128_const1_64( 0xDBB930F1DBB930F1 ); + sc->A[ 7] = m128_const1_64( 0xFAD06B8BFAD06B8B ); + sc->A[ 8] = m128_const1_64( 0xBE0CAE40BE0CAE40 ); + sc->A[ 9] = m128_const1_64( 0x8BD144108BD14410 ); + sc->A[10] = m128_const1_64( 0x76D2ADAC76D2ADAC ); + sc->A[11] = m128_const1_64( 0x28ACAB7F28ACAB7F ); + + sc->B[ 0] = m128_const1_64( 0xC1099CB7C1099CB7 ); + sc->B[ 1] = m128_const1_64( 0x07B385F307B385F3 ); + sc->B[ 2] = m128_const1_64( 0xE7442C26E7442C26 ); + sc->B[ 3] = m128_const1_64( 0xCC8AD640CC8AD640 ); + sc->B[ 4] = m128_const1_64( 0xEB6F56C7EB6F56C7 ); + sc->B[ 5] = m128_const1_64( 0x1EA81AA91EA81AA9 ); + sc->B[ 6] = m128_const1_64( 0x73B9D31473B9D314 ); + sc->B[ 7] = m128_const1_64( 0x1DE85D081DE85D08 ); + sc->B[ 8] = m128_const1_64( 0x48910A5A48910A5A ); + sc->B[ 9] = m128_const1_64( 0x893B22DB893B22DB ); + sc->B[10] = m128_const1_64( 0xC5A0DF44C5A0DF44 ); + sc->B[11] = m128_const1_64( 0xBBC4324EBBC4324E ); + sc->B[12] = m128_const1_64( 0x72D2F24072D2F240 ); + sc->B[13] = m128_const1_64( 0x75941D9975941D99 ); + sc->B[14] = m128_const1_64( 0x6D8BDE826D8BDE82 ); + sc->B[15] = m128_const1_64( 0xA1A7502BA1A7502B ); + + sc->C[ 0] = m128_const1_64( 0xD9BF68D1D9BF68D1 ); + sc->C[ 1] = m128_const1_64( 0x58BAD75058BAD750 ); + sc->C[ 2] = m128_const1_64( 0x56028CB256028CB2 ); + sc->C[ 3] = m128_const1_64( 0x8134F3598134F359 ); + sc->C[ 4] = m128_const1_64( 0xB5D469D8B5D469D8 ); + sc->C[ 5] = m128_const1_64( 0x941A8CC2941A8CC2 ); + sc->C[ 6] = m128_const1_64( 0x418B2A6E418B2A6E ); + sc->C[ 7] = m128_const1_64( 0x0405278004052780 ); + sc->C[ 8] = m128_const1_64( 0x7F07D7877F07D787 ); + sc->C[ 9] = m128_const1_64( 0x5194358F5194358F ); + sc->C[10] = m128_const1_64( 0x3C60D6653C60D665 ); + sc->C[11] = m128_const1_64( 0xBE97D79ABE97D79A ); + sc->C[12] = m128_const1_64( 0x950C3434950C3434 ); + sc->C[13] = m128_const1_64( 0xAED9A06DAED9A06D ); + sc->C[14] = m128_const1_64( 0x2537DC8D2537DC8D ); + sc->C[15] = m128_const1_64( 0x7CDB59697CDB5969 ); +*/ } else - { - for ( i = 0; i < 12; i++ ) - sc->A[i] = _mm_set1_epi32( A_init_256[i] ); - for ( i = 0; i < 16; i++ ) - { - sc->B[i] = _mm_set1_epi32( B_init_256[i] ); - sc->C[i] = _mm_set1_epi32( C_init_256[i] ); - } - } + { // No users + sc->state_loaded = true; + sc->A[ 0] = m128_const1_64( 0x52F8455252F84552 ); + sc->A[ 1] = m128_const1_64( 0xE54B7999E54B7999 ); + sc->A[ 2] = m128_const1_64( 0x2D8EE3EC2D8EE3EC ); + sc->A[ 3] = m128_const1_64( 0xB9645191B9645191 ); + sc->A[ 4] = m128_const1_64( 0xE0078B86E0078B86 ); + sc->A[ 5] = m128_const1_64( 0xBB7C44C9BB7C44C9 ); + sc->A[ 6] = m128_const1_64( 0xD2B5C1CAD2B5C1CA ); + sc->A[ 7] = m128_const1_64( 0xB0D2EB8CB0D2EB8C ); + sc->A[ 8] = m128_const1_64( 0x14CE5A4514CE5A45 ); + sc->A[ 9] = m128_const1_64( 0x22AF50DC22AF50DC ); + sc->A[10] = m128_const1_64( 0xEFFDBC6BEFFDBC6B ); + sc->A[11] = m128_const1_64( 0xEB21B74AEB21B74A ); + + sc->B[ 0] = m128_const1_64( 0xB555C6EEB555C6EE ); + sc->B[ 1] = m128_const1_64( 0x3E7105963E710596 ); + sc->B[ 2] = m128_const1_64( 0xA72A652FA72A652F ); + sc->B[ 3] = m128_const1_64( 
0x9301515F9301515F ); + sc->B[ 4] = m128_const1_64( 0xDA28C1FADA28C1FA ); + sc->B[ 5] = m128_const1_64( 0x696FD868696FD868 ); + sc->B[ 6] = m128_const1_64( 0x9CB6BF729CB6BF72 ); + sc->B[ 7] = m128_const1_64( 0x0AFE40020AFE4002 ); + sc->B[ 8] = m128_const1_64( 0xA6E03615A6E03615 ); + sc->B[ 9] = m128_const1_64( 0x5138C1D45138C1D4 ); + sc->B[10] = m128_const1_64( 0xBE216306BE216306 ); + sc->B[11] = m128_const1_64( 0xB38B8890B38B8890 ); + sc->B[12] = m128_const1_64( 0x3EA8B96B3EA8B96B ); + sc->B[13] = m128_const1_64( 0x3299ACE43299ACE4 ); + sc->B[14] = m128_const1_64( 0x30924DD430924DD4 ); + sc->B[15] = m128_const1_64( 0x55CB34A555CB34A5 ); + + sc->C[ 0] = m128_const1_64( 0xB405F031B405F031 ); + sc->C[ 1] = m128_const1_64( 0xC4233EBAC4233EBA ); + sc->C[ 2] = m128_const1_64( 0xB3733979B3733979 ); + sc->C[ 3] = m128_const1_64( 0xC0DD9D55C0DD9D55 ); + sc->C[ 4] = m128_const1_64( 0xC51C28AEC51C28AE ); + sc->C[ 5] = m128_const1_64( 0xA327B8E1A327B8E1 ); + sc->C[ 6] = m128_const1_64( 0x56C5616756C56167 ); + sc->C[ 7] = m128_const1_64( 0xED614433ED614433 ); + sc->C[ 8] = m128_const1_64( 0x88B59D6088B59D60 ); + sc->C[ 9] = m128_const1_64( 0x60E2CEBA60E2CEBA ); + sc->C[10] = m128_const1_64( 0x758B4B8B758B4B8B ); + sc->C[11] = m128_const1_64( 0x83E82A7F83E82A7F ); + sc->C[12] = m128_const1_64( 0xBC968828BC968828 ); + sc->C[13] = m128_const1_64( 0xE6E00BF7E6E00BF7 ); + sc->C[14] = m128_const1_64( 0xBA839E55BA839E55 ); + sc->C[15] = m128_const1_64( 0x9B491C609B491C60 ); + } sc->Wlow = 1; sc->Whigh = 0; sc->ptr = 0; @@ -488,6 +622,8 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len ) sc->ptr = ptr; return; } + + READ_STATE(sc); while ( len > 0 ) diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index dbdfe2b..bf54b59 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -54,7 +54,8 @@ typedef struct { __m128i buf[16] __attribute__ ((aligned (64))); __m128i A[12], B[16], C[16]; sph_u32 Whigh, Wlow; - size_t ptr; + size_t ptr; + bool state_loaded; } shabal_4way_context; typedef shabal_4way_context shabal256_4way_context; diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index d600e60..447cb3a 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -5,7 +5,7 @@ #if defined(__SHA__) #include #else - #include "algo/sha/sha2-hash-4way.h" + #include "algo/sha/sha-hash-4way.h" #endif #if defined (SKEIN_4WAY) diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 358ecd8..a070ca2 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -415,18 +415,46 @@ do { \ sc->bcount = bcount; \ } while (0) +/* +static const sph_u64 IV256[] = { + SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), + SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), + SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), + SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) +}; -static void -skein_big_init_4way( skein512_4way_context *sc, const sph_u64 *iv ) +static const sph_u64 IV512[] = { + SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), + SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), + SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), + SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) +}; +*/ + +void skein256_4way_init( skein256_4way_context *sc ) { - sc->h0 = _mm256_set_epi64x( iv[0], iv[0],iv[0],iv[0] ); - sc->h1 = _mm256_set_epi64x( iv[1], iv[1],iv[1],iv[1] ); - sc->h2 = _mm256_set_epi64x( iv[2], iv[2],iv[2],iv[2] ); - sc->h3 = 
_mm256_set_epi64x( iv[3], iv[3],iv[3],iv[3] ); - sc->h4 = _mm256_set_epi64x( iv[4], iv[4],iv[4],iv[4] ); - sc->h5 = _mm256_set_epi64x( iv[5], iv[5],iv[5],iv[5] ); - sc->h6 = _mm256_set_epi64x( iv[6], iv[6],iv[6],iv[6] ); - sc->h7 = _mm256_set_epi64x( iv[7], iv[7],iv[7],iv[7] ); + sc->h0 = m256_const1_64( 0xCCD044A12FDB3E13 ); + sc->h1 = m256_const1_64( 0xE83590301A79A9EB ); + sc->h2 = m256_const1_64( 0x55AEA0614F816E6F ); + sc->h3 = m256_const1_64( 0x2A2767A4AE9B94DB ); + sc->h4 = m256_const1_64( 0xEC06025E74DD7683 ); + sc->h5 = m256_const1_64( 0xE7A436CDC4746251 ); + sc->h6 = m256_const1_64( 0xC36FBAF9393AD185 ); + sc->h7 = m256_const1_64( 0x3EEDBA1833EDFC13 ); + sc->bcount = 0; + sc->ptr = 0; +} + +void skein512_4way_init( skein512_4way_context *sc ) +{ + sc->h0 = m256_const1_64( 0x4903ADFF749C51CE ); + sc->h1 = m256_const1_64( 0x0D95DE399746DF03 ); + sc->h2 = m256_const1_64( 0x8FD1934127C79BCE ); + sc->h3 = m256_const1_64( 0x9A255629FF352CB1 ); + sc->h4 = m256_const1_64( 0x5DB62599DF6CA7B0 ); + sc->h5 = m256_const1_64( 0xEABE394CA9D5C3F4 ); + sc->h6 = m256_const1_64( 0x991112C71A75B523 ); + sc->h7 = m256_const1_64( 0xAE18A40B660FCC33 ); sc->bcount = 0; sc->ptr = 0; } @@ -524,6 +552,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n, memcpy_256( dst, buf, out_len >> 3 ); } +/* static const sph_u64 IV256[] = { SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), @@ -537,13 +566,14 @@ static const sph_u64 IV512[] = { SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) }; - - +*/ +/* void skein256_4way_init(void *cc) { skein_big_init_4way(cc, IV256); } +*/ void skein256_4way(void *cc, const void *data, size_t len) @@ -557,11 +587,13 @@ skein256_4way_close(void *cc, void *dst) skein_big_close_4way(cc, 0, 0, dst, 32); } +/* void skein512_4way_init(void *cc) { skein_big_init_4way(cc, IV512); } +*/ void skein512_4way(void *cc, const void *data, size_t len) diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index 106daeb..8ff9285 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -55,25 +55,26 @@ extern "C"{ #define SPH_SIZE_skein256 256 #define SPH_SIZE_skein512 512 -typedef struct { - __m256i buf[8] __attribute__ ((aligned (32))); - __m256i h0, h1, h2, h3, h4, h5, h6, h7; - size_t ptr; +typedef struct +{ + __m256i buf[8] __attribute__ ((aligned (64))); + __m256i h0, h1, h2, h3, h4, h5, h6, h7; + size_t ptr; sph_u64 bcount; } sph_skein_4way_big_context; typedef sph_skein_4way_big_context skein512_4way_context; typedef sph_skein_4way_big_context skein256_4way_context; -void skein512_4way_init(void *cc); -void skein512_4way(void *cc, const void *data, size_t len); -void skein512_4way_close(void *cc, void *dst); +void skein512_4way_init( skein512_4way_context *sc ); +void skein512_4way( void *cc, const void *data, size_t len ); +void skein512_4way_close( void *cc, void *dst ); //void sph_skein512_addbits_and_close( // void *cc, unsigned ub, unsigned n, void *dst); -void skein256_4way_init(void *cc); -void skein256_4way(void *cc, const void *data, size_t len); -void skein256_4way_close(void *cc, void *dst); +void skein256_4way_init( skein256_4way_context *sc ); +void skein256_4way( void *cc, const void *data, size_t len ); +void skein256_4way_close( void *cc, void *dst ); //void sph_skein256_addbits_and_close( // void *cc, unsigned ub, unsigned n, void *dst); diff --git a/algo/x16/x16r-4way.c 
b/algo/x16/x16r-4way.c index 1338f8b..2cac7c9 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -27,7 +27,7 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index db2b20c..a5adf15 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -92,7 +92,7 @@ void x16rt_getAlgoString( const uint32_t *timeHash, char *output) *sptr = '\0'; } -void x16rt_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) +void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { uchar merkle_tree[64] = { 0 }; size_t t; @@ -204,7 +204,7 @@ bool register_x16rt_veil_algo( algo_gate_t* gate ) #endif gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; gate->set_target = (void*)&alt_set_target; - gate->build_extraheader = (void*)&x16rt_build_extraheader; + gate->build_extraheader = (void*)&veil_build_extraheader; return true; }; diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index a0941da..7ac133b 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -21,7 +21,7 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" static __thread uint32_t s_ntime = UINT32_MAX; static __thread bool s_implemented = false; diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 31e3f27..ee33000 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -27,7 +27,7 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" #include "algo/haval/haval-hash-4way.h" #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index 39a037a..b5ec133 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -23,7 +23,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" union _sonoa_4way_context_overlay { diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index f34f7ee..77cc6e6 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -22,7 +22,7 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" union _x17_4way_context_overlay { @@ -210,11 +210,11 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce, uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash7 = &(hash[7<<2]); uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t n = pdata[19]; + const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + uint32_t n = first_nonce; + const int thr_id = mythr->id; const uint32_t Htarg = ptarget[7]; uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; @@ -225,7 +225,7 @@ int scanhash_x17_4way( struct work *work, uint32_t 
max_nonce, mm256_bswap32_intrlv80_4x64( vdata, pdata ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { - uint32_t mask = masks[ m ]; + const uint32_t mask = masks[ m ]; do { *noncev = mm256_intrlv_blend_32( mm256_bswap_32( diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index aad5b27..91a2a9f 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -22,7 +22,7 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sha2-hash-4way.h" +#include "algo/sha/sha-hash-4way.h" #include "algo/haval/haval-hash-4way.h" union _xevan_4way_context_overlay diff --git a/configure b/configure index 62cc85e..5d30396 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.6.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.6.2. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.9.6.1' -PACKAGE_STRING='cpuminer-opt 3.9.6.1' +PACKAGE_VERSION='3.9.6.2' +PACKAGE_STRING='cpuminer-opt 3.9.6.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.9.6.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.9.6.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.9.6.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.9.6.2:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.9.6.1 +cpuminer-opt configure 3.9.6.2 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.9.6.1, which was +It was created by cpuminer-opt $as_me 3.9.6.2, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.9.6.1' + VERSION='3.9.6.2' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.9.6.1, which was +This file was extended by cpuminer-opt $as_me 3.9.6.2, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.9.6.1 +cpuminer-opt config.status 3.9.6.2 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index c80f89e..3573cf6 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.9.6.1]) +AC_INIT([cpuminer-opt], [3.9.6.2]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 0141228..cbf1f80 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2044,7 +2044,7 @@ static void *miner_thread( void *userdata ) else applog( LOG_NOTICE, "Mining timeout of %ds reached, exiting...", opt_time_limit); - proper_exit(0); + proper_exit(0); } if (remain < max64) max64 = remain; } @@ -2079,7 +2079,7 @@ static void *miner_thread( void *userdata ) hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 ); pthread_mutex_unlock( &stats_lock ); } - // if nonce(s) found submit work + // If unsubmitted nonce(s) found, submit. if ( nonce_found && !opt_benchmark ) { if ( !submit_work( mythr, &work ) ) @@ -2242,7 +2242,7 @@ static void *miner_thread( void *userdata ) thr_id, hc, hc_units, hr, hr_units ); } } - if ( thr_id == 0 ) + if ( thr_id == 0 && !opt_benchmark ) { hashcount = 0.; hashrate = 0.; diff --git a/miner.h b/miner.h index 150bac8..ac0e4e9 100644 --- a/miner.h +++ b/miner.h @@ -510,9 +510,9 @@ enum algos { ALGO_AXIOM, ALGO_BASTION, ALGO_BLAKE, - ALGO_BLAKECOIN, -// ALGO_BLAKE2B, + ALGO_BLAKE2B, ALGO_BLAKE2S, + ALGO_BLAKECOIN, ALGO_BMW, ALGO_BMW512, ALGO_C11, @@ -604,9 +604,9 @@ static const char* const algo_names[] = { "axiom", "bastion", "blake", - "blakecoin", -// "blake2b", + "blake2b", "blake2s", + "blakecoin", "bmw", "bmw512", "c11", @@ -761,8 +761,9 @@ Options:\n\ axiom Shabal-256 MemoHash\n\ bastion\n\ blake blake256r14 (SFR)\n\ - blakecoin blake256r8\n\ + blake2b Blake2b 256\n\ blake2s Blake-2 S\n\ + blakecoin blake256r8\n\ bmw BMW 256\n\ bmw512 BMW 512\n\ c11 Chaincoin\n\ diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index d359a87..1840896 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -677,41 +677,40 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src ) { __m512i s0 = mm512_bswap_32( casti_m512i( src, 0 ) ); __m128i s1 = mm128_bswap_32( casti_m128i( src, 4 ) ); - const __m512i zero = m512_zero; - const __m512i one = m512_one_32; - const __m512i two = _mm512_add_epi32( one, one ); - const __m512i three = _mm512_add_epi32( two, one ); - const __m512i four = _mm512_add_epi32( two, two ); - const __m512i eight = _mm512_add_epi32( four, four ); - const __m512i eleven = _mm512_add_epi32( eight, three ); + const __m512i one = m512_one_32; + const __m512i two = _mm512_add_epi32( one, one ); + const __m512i three = _mm512_add_epi32( two, one ); + __m512i x = _mm512_add_epi32( three, three ); - casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero ); - casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one ); - casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two ); - casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, three ); - casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four ); + casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, m512_zero ); + casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one ); + casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two ); + 
casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, three ); + casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, + _mm512_add_epi32( two, two ) ); casti_m512i( d, 5 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( four, one ) ); - casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( four, two ) ); + _mm512_add_epi32( three, two ) ); + casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( s0, x ); casti_m512i( d, 7 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( four, three ) ); - casti_m512i( d, 8 ) = _mm512_permutexvar_epi32( s0, eight ); - casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eight, one ) ); + _mm512_add_epi32( x, one ) ); + casti_m512i( d, 8 ) = _mm512_permutexvar_epi32( s0, + _mm512_add_epi32( x, two ) ); + x = _mm512_add_epi32( x, three ); + casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( s0, x ); casti_m512i( d,10 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eight, two ) ); - casti_m512i( d,11 ) = _mm512_permutexvar_epi32( s0, eleven ); - casti_m512i( d,12 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eleven, one ) ); - casti_m512i( d,13 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eleven, two ) ); - casti_m512i( d,14 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eleven, three ) ); + _mm512_add_epi32( x, one ) ); + casti_m512i( d,11 ) = _mm512_permutexvar_epi32( s0, + _mm512_add_epi32( x, two ) ); + x = _mm512_add_epi32( x, three ); + casti_m512i( d,12 ) = _mm512_permutexvar_epi32( s0, x ); + casti_m512i( d,13 ) = _mm512_permutexvar_epi32( s0, + _mm512_add_epi32( x, one ) ); + casti_m512i( d,14 ) = _mm512_permutexvar_epi32( s0, + _mm512_add_epi32( x, two ) ); casti_m512i( d,15 ) = _mm512_permutexvar_epi32( s0, - _mm512_add_epi32( eleven, four ) ); + _mm512_add_epi32( x, three ) ); casti_m512i( d,16 ) = _mm512_permutexvar_epi32( - _mm512_castsi128_si512( s1 ), zero ); + _mm512_castsi128_si512( s1 ), m512_zero ); casti_m512i( d,17 ) = _mm512_permutexvar_epi32( _mm512_castsi128_si512( s1 ), one ); casti_m512i( d,18 ) = _mm512_permutexvar_epi32( @@ -769,14 +768,14 @@ static inline void dintrlv_2x64( void *dst0, void *dst1, // 4x64 (AVX2) -static inline void intrlv_4x64( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, int bit_len ) +static inline void intrlv_4x64( void *dst, void *src0, + void *src1, void *src2, void *src3, int bit_len ) { uint64_t *d = (uint64_t*)dst; - const uint64_t *s0 = (const uint64_t*)src0; - const uint64_t *s1 = (const uint64_t*)src1; - const uint64_t *s2 = (const uint64_t*)src2; - const uint64_t *s3 = (const uint64_t*)src3; + uint64_t *s0 = (uint64_t*)src0; + uint64_t *s1 = (uint64_t*)src1; + uint64_t *s2 = (uint64_t*)src2; + uint64_t *s3 = (uint64_t*)src3; d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s2[ 0]; d[ 3] = s3[ 0]; d[ 4] = s0[ 1]; d[ 5] = s1[ 1]; d[ 6] = s2[ 1]; d[ 7] = s3[ 1]; d[ 8] = s0[ 2]; d[ 9] = s1[ 2]; d[ 10] = s2[ 2]; d[ 11] = s3[ 2]; @@ -870,10 +869,12 @@ static inline void extr_lane_4x64( void *d, const void *s, ((uint64_t*)d)[ 1] = ((uint64_t*)s)[ lane+ 4 ]; ((uint64_t*)d)[ 2] = ((uint64_t*)s)[ lane+ 8 ]; ((uint64_t*)d)[ 3] = ((uint64_t*)s)[ lane+12 ]; + if ( bit_len <= 256 ) return; ((uint64_t*)d)[ 4] = ((uint64_t*)s)[ lane+16 ]; ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+20 ]; ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+24 ]; ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+28 ]; +/* if ( bit_len <= 256 ) return; ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+32 ]; ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+36 ]; @@ -883,6 +884,7 
@@ static inline void extr_lane_4x64( void *d, const void *s, ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+52 ]; ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+56 ]; ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+60 ]; +*/ } #if defined(__AVX2__) @@ -984,10 +986,12 @@ static inline void extr_lane_8x64( void *d, const void *s, ((uint64_t*)d)[ 1] = ((uint64_t*)s)[ lane+ 8 ]; ((uint64_t*)d)[ 2] = ((uint64_t*)s)[ lane+ 16 ]; ((uint64_t*)d)[ 3] = ((uint64_t*)s)[ lane+ 24 ]; + if ( bit_len <= 256 ) return; ((uint64_t*)d)[ 4] = ((uint64_t*)s)[ lane+ 32 ]; ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+ 40 ]; ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+ 48 ]; ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+ 56 ]; +/* if ( bit_len <= 256 ) return; ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+ 64 ]; ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+ 72 ]; @@ -997,6 +1001,7 @@ static inline void extr_lane_8x64( void *d, const void *s, ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+104 ]; ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+112 ]; ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+120 ]; +*/ } #if defined(__AVX512F__) && defined(__AVX512VL__) @@ -1006,13 +1011,13 @@ static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src ) __m512i *d = (__m512i*)dst; __m512i s0 = mm512_bswap_32( casti_m512i(src, 0 ) ); __m128i s1 = mm128_bswap_32( casti_m128i(src, 4 ) ); - const __m512i zero = m512_zero; +// const __m512i zero = m512_zero; const __m512i one = m512_one_64; const __m512i two = _mm512_add_epi64( one, one ); const __m512i three = _mm512_add_epi64( two, one ); const __m512i four = _mm512_add_epi64( two, two ); - d[0] = _mm512_permutexvar_epi64( s0, zero ); + d[0] = _mm512_permutexvar_epi64( s0, m512_zero ); d[1] = _mm512_permutexvar_epi64( s0, one ); d[2] = _mm512_permutexvar_epi64( s0, two ); d[3] = _mm512_permutexvar_epi64( s0, three ); @@ -1021,7 +1026,7 @@ static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src ) d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two ) ); d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, three ) ); d[8] = _mm512_permutexvar_epi64( - _mm512_castsi128_si512( s1 ), zero ); + _mm512_castsi128_si512( s1 ), m512_zero ); d[9] = _mm512_permutexvar_epi64( _mm512_castsi128_si512( s1 ), one ); } diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index bad84b0..07d630e 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -10,29 +10,23 @@ // SSE2 is generally required for full 128 bit support. Some functions // are also optimized with SSSE3 or SSE4.1. // -// Do not call _mm_extract directly, it isn't supported in SSE2. -// Use mm128_extr instead, it will select the appropriate implementation. +// Do not call intrinsic _mm_extract directly, it isn't supported in SSE2. +// Use mm128_extr macro instead, it will select the appropriate implementation. // // 128 bit operations are enhanced with uint128 which adds 128 bit integer // support for arithmetic and other operations. Casting to uint128_t is not -// efficient but is sometimes the only way for certain operations. +// free but is sometimes the only way for certain operations. // // Constants are an issue with simd. Simply put, immediate constants don't -// exist. All simd constants either reside in memory or a register. -// The distibction is made below with c128 being memory resident defined -// at compile time and m128 being register defined at run time. +// exist. All simd constants either reside in memory or a register and +// must be loaded or generated at run time. 
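+// For example (an illustrative sketch, not code from this file): reuse a
+// generated pseudo-constant through a local const instead of regenerating
+// it on every reference,
+//
+//    const __m128i one = m128_one_64;   // generated once, in a register
+//    for ( int i = 0; i < n; i++ )
+//       x[i] = _mm_add_epi64( x[i], one );   // reused at no extra cost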
// -// All run time constants must be generated using their components elements -// incurring significant overhead. The more elements the more overhead -// both in instructions and in GP register usage. Whenever possible use -// 64 bit constant elements regardless of the actual element size. -// -// Due to the cost of generating constants they should not be regenerated -// in the same function. Instead, define a local const. +// Due to the cost of generating constants it is often more efficient to +// define a local const for repeated references to the same constant. // // Some constant values can be generated using shortcuts. Zero for example // is as simple as XORing any register with itself, and is implemented -// in the setzero instrinsic. These shortcuts must be implemented is asm +// in the setzero intrinsic. These shortcuts must be implemented using ASM // due to doing things the compiler would complain about. Another single // instruction constant is -1, defined below. Others may be added as the need // arises. Even single instruction constants are less efficient than local @@ -43,87 +37,59 @@ // into account. Those that generate a simd constant should not be used // repeatedly. It may be better for the application to reimplement the // utility to better suit its usage. -// -// More tips: -// -// Conversions from integer to vector should be avoided whenever possible. -// Extract, insert and set and set1 instructions should be avoided. -// In addition to the issues with constants set is also very inefficient with -// variables. -// Converting integer data to perform a couple of vector operations -// then converting back to integer should be avoided. Converting data in -// registers should also be avoided. Conversion should be limited to buffers -// in memory where the data is loaded directly to vector registers, bypassing -// the integer to vector conversion. -// -// Pseudo constants. -// -// These can't be used for compile time initialization. -// These should be used for all simple vectors. -// Repeated usage of any simd pseudo-constant should use a locally defined -// const rather than recomputing it for every reference. #define m128_zero _mm_setzero_si128() -// As suggested by Intel... -// Arg passing for simd registers is assumed to be first output arg, -// then input args, then locals. This is probably wrong, gcc likely picks -// whichever register is currently holding the variable, or whichever -// register is available to hold it. Nevertheless, all args are specified -// by their arg number and local variables use registers starting at -// last arg + 1, by type. -// Output args don't need to be listed as clobbered. 
- +static inline __m128i m128_one_128_fn() +{ +// movq can't load an immediate into an xmm register, go through a GP reg. + register uint64_t one = 1; + register __m128i a; + asm( "movq %1, %0\n\t" + : "=x"(a) + : "r"(one) ); + return a; +} +#define m128_one_128 m128_one_128_fn() static inline __m128i m128_one_64_fn() { - __m128i a; - asm( "pxor %0, %0\n\t" - "pcmpeqd %%xmm1, %%xmm1\n\t" - "psubq %%xmm1, %0\n\t" + register uint64_t one = 1; + register __m128i a; + asm( "movq %1, %0\n\t" : "=x"(a) - : - : "xmm1" ); - return a; + : "r"(one) ); + return _mm_shuffle_epi32( a, 0x44 ); // broadcast low 64 bits to both lanes } #define m128_one_64 m128_one_64_fn() static inline __m128i m128_one_32_fn() { - __m128i a; - asm( "pxor %0, %0\n\t" - "pcmpeqd %%xmm1, %%xmm1\n\t" - "psubd %%xmm1, %0\n\t" + register uint32_t one = 1; + register __m128i a; + asm( "movd %1, %0\n\t" : "=x"(a) - : - : "xmm1" ); - return a; + : "r"(one) ); + return _mm_shuffle_epi32( a, 0x00 ); } #define m128_one_32 m128_one_32_fn() static inline __m128i m128_one_16_fn() { - __m128i a; - asm( "pxor %0, %0\n\t" - "pcmpeqd %%xmm1, %%xmm1\n\t" - "psubw %%xmm1, %0\n\t" + register uint32_t one = 0x00010001; + register __m128i a; + asm( "movd %1, %0\n\t" : "=x"(a) - : - : "xmm1" ); - return a; + : "r"(one) ); + return _mm_shuffle_epi32( a, 0x00 ); } #define m128_one_16 m128_one_16_fn() static inline __m128i m128_one_8_fn() { - __m128i a; - asm( "pxor %0, %0\n\t" - "pcmpeqd %%xmm1, %%xmm1\n\t" - "psubb %%xmm1, %0\n\t" + register uint32_t one = 0x01010101; + register __m128i a; + asm( "movd %1, %0\n\t" : "=x"(a) - : - : "xmm1" ); - return a; + : "r"(one) ); + return _mm_shuffle_epi32( a, 0x00 ); } #define m128_one_8 m128_one_8_fn() @@ -136,35 +102,73 @@ static inline __m128i m128_neg1_fn() } #define m128_neg1 m128_neg1_fn() +// move uint64_t to low bits of __m128i, zeros the rest +static inline __m128i mm128_mov64_128( uint64_t n ) +{ + register __m128i a; + asm( "movq %1, %0\n\t" + : "=x" (a) + : "r" (n) ); + return a; +} + +static inline __m128i mm128_mov32_128( uint32_t n ) +{ + register __m128i a; + asm( "movd %1, %0\n\t" + : "=x" (a) + : "r" (n) ); + return a; +} + +static inline uint64_t mm128_mov128_64( __m128i a ) +{ + register uint64_t n; + asm( "movq %1, %0\n\t" + : "=r" (n) + : "x" (a) ); + return n; +} + +static inline uint32_t mm128_mov128_32( __m128i a ) +{ + register uint32_t n; + asm( "movd %1, %0\n\t" + : "=r" (n) + : "x" (a) ); + return n; +} + #if defined(__SSE41__) -static inline __m128i m128_one_128_fn() -{ - __m128i a; - asm( "pinsrq $0, $1, %0\n\t" - "pinsrq $1, $0, %0\n\t" - : "=x"(a) ); - return a; -} -#define m128_one_128 m128_one_128_fn() - // alternative to _mm_set_epi64x, doesn't use mem, -// cost = 2 pinsrt, estimate 4 clocks. -static inline __m128i m128_const_64( uint64_t hi, uint64_t lo ) + +static inline __m128i m128_const_64( const uint64_t hi, const uint64_t lo ) { - __m128i a; - asm( "pinsrq $0, %2, %0\n\t" + register __m128i a; + asm( "movq %2, %0\n\t" "pinsrq $1, %1, %0\n\t" : "=x"(a) : "r"(hi), "r"(lo) ); return a; -} +} + +static inline __m128i m128_const1_64( const uint64_t n ) +{ + register __m128i a; + asm( "movq %1, %0\n\t" + "pinsrq $1, %1, %0\n\t" + : "=x"(a) + : "r"(n) ); + return a; +} #else -#define m128_one_128 _mm_set_epi64x( 0ULL, 1ULL ) +// #define m128_one_128 _mm_set_epi64x( 0ULL, 1ULL ) -#define m128_const_64 _mm_set_epi64x +#define m128_const_64 _mm_set_epi64x +#define m128_const1_64 _mm_set1_epi64x #endif @@ -309,13 +313,13 @@ do { \ // Assumes data is aligned and integral. 
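+// For example, zeroing a 64 byte buffer of 4 vectors (buf is a
+// hypothetical 16 byte aligned array): memset_zero_128( buf, 64/16 );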
// n = number of __m128i, bytes/16 -static inline void memset_zero_128( __m128i *dst, int n ) +static inline void memset_zero_128( __m128i *dst, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; } -static inline void memset_128( __m128i *dst, const __m128i a, int n ) +static inline void memset_128( __m128i *dst, const __m128i a, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = a; } -static inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) +static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } @@ -383,13 +387,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) // // Rotate elements within lanes. +// Equivalent to mm128_ror_64( v, 32 ) #define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 ) +// Equivalent to mm128_ror_64( v, 16 ) #define mm128_ror16_64( v ) _mm_shuffle_epi8( v, \ m128_const_64( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) ) #define mm128_rol16_64( v ) _mm_shuffle_epi8( v, \ m128_const_64( 0x0d0c0b0a09080f0e, 0x0504030201000706 ) ) +// Equivalent to mm128_ror_32( v, 16 ) #define mm128_swap16_32( v ) _mm_shuffle_epi8( v, \ m128_const_64( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) ) @@ -459,7 +466,7 @@ static inline __m128i mm128_bswap_16( __m128i v ) return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); } -static inline void mm128_block_bswap_64( __m128i *d, __m128i *s ) +static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s ) { d[0] = mm128_bswap_64( s[0] ); d[1] = mm128_bswap_64( s[1] ); @@ -471,7 +478,7 @@ static inline void mm128_block_bswap_64( __m128i *d, __m128i *s ) d[7] = mm128_bswap_64( s[7] ); } -static inline void mm128_block_bswap_32( __m128i *d, __m128i *s ) +static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) { d[0] = mm128_bswap_32( s[0] ); d[1] = mm128_bswap_32( s[1] ); diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 9f7a233..a6882e9 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -15,91 +15,88 @@ // is available. // -// Pseudo constants. -// These can't be used for compile time initialization but are preferable -// for simple constant vectors at run time. For repeated use define a local -// constant to avoid multiple calls to the same macro. +// All SIMD constant macros are actually functions containing executable +// code and therefore can't be used as compile time initializers. #define m256_zero _mm256_setzero_si256() #define m256_one_256 \ - _mm256_insertf128_si256( _mm256_castsi128_si256( m128_one_128 ), \ + _mm256_inserti128_si256( _mm256_castsi128_si256( m128_one_128 ), \ m128_zero, 1 ) #define m256_one_128 \ - _mm256_insertf128_si256( _mm256_castsi128_si256( m128_one_128 ), \ + _mm256_inserti128_si256( _mm256_castsi128_si256( m128_one_128 ), \ m128_one_128, 1 ) // set instructions load memory resident constants, this avoids mem. -// cost 4 pinsert + 1 vinsert, estimate 7 clocks. -// Avoid using, mm128_const_64 twice is still faster. +// cost 4 pinsert + 1 vinsert, estimate 8 clocks latency. 
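+// e.g. m256_const_64( 3, 2, 1, 0 ) builds the vector { 3, 2, 1, 0 }
+// (64 bit elements, high to low) entirely in registers.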
+ #define m256_const_64( i3, i2, i1, i0 ) \ - _mm256_insertf128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \ - m128_const_64( i3, i2 ), 1 ) -#define m256_const1_64( i ) m256_const_64( i, i, i, i ) + _mm256_inserti128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \ + m128_const_64( i3, i2 ), 1 ) + +static inline __m256i m256_const1_64( uint64_t i ) +{ + register __m128i a; + asm( "movq %1, %0\n\t" + : "=x"(a) + : "r"(i) ); + return _mm256_broadcastq_epi64( a ); +} #if defined(__AVX2__) -// These look like a lot of overhead but the compiler optimizes nicely -// and puts the asm inline in the calling function. Usage is like any -// variable expression. +// Don't call the function directly, use the macro to make it appear like +// a constant identifier instead of a function. // __m256i foo = m256_one_64; static inline __m256i m256_one_64_fn() { - __m256i a; - asm( "vpxor %0, %0, %0\n\t" - "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t" - "vpsubq %%ymm1, %0, %0\n\t" - : "=x"(a) - : - : "ymm1" ); - return a; + register uint64_t one = 1; + register __m128i a; + asm( "movq %1, %0\n\t" + : "=x" (a) + : "r" (one) ); + return _mm256_broadcastq_epi64( a ); } #define m256_one_64 m256_one_64_fn() static inline __m256i m256_one_32_fn() { - __m256i a; - asm( "vpxor %0, %0, %0\n\t" - "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t" - "vpsubd %%ymm1, %0, %0\n\t" - : "=x"(a) - : - : "ymm1" ); - return a; + register uint64_t one = 0x0000000100000001; + register __m128i a; + asm( "movq %1, %0\n\t" + : "=x" (a) + : "r" (one) ); + return _mm256_broadcastq_epi64( a ); } #define m256_one_32 m256_one_32_fn() static inline __m256i m256_one_16_fn() { - __m256i a; - asm( "vpxor %0, %0, %0\n\t" - "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t" - "vpsubw %%ymm1, %0, %0\n\t" - : "=x"(a) - : - : "ymm1" ); - return a; + register uint64_t one = 0x0001000100010001; + register __m128i a; + asm( "movq %1, %0\n\t" + : "=x" (a) + : "r" (one) ); + return _mm256_broadcastq_epi64( a ); } #define m256_one_16 m256_one_16_fn() static inline __m256i m256_one_8_fn() { - __m256i a; - asm( "vpxor %0, %0, %0\n\t" - "vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t" - "vpsubb %%ymm1, %0, %0\n\t" - : "=x"(a) - : - : "ymm1" ); - return a; + register uint64_t one = 0x0101010101010101; + register __m128i a; + asm( "movq %1, %0\n\t" + : "=x" (a) + : "r" (one) ); + return _mm256_broadcastq_epi64( a ); } #define m256_one_8 m256_one_8_fn() static inline __m256i m256_neg1_fn() { - __m256i a; + register __m256i a; asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) ); return a; @@ -114,16 +111,16 @@ static inline __m256i m256_neg1_fn() #define m256_one_8 _mm256_set1_epi64x( 0x0101010101010101ULL ) // AVX doesn't have inserti128 but insertf128 will do. -// Ideally this can be done with 2 instructions and no temporary variables. static inline __m256i m256_neg1_fn() { __m128i a = m128_neg1; return _mm256_insertf128_si256( _mm256_castsi128_si256( a ), a, 1 ); } #define m256_neg1 m256_neg1_fn() -//#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) #endif // AVX2 else AVX + + // // Vector size conversion. // @@ -139,11 +136,11 @@ static inline __m256i m256_neg1_fn() #define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a ) #define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 ) -// Extract 4 u64 from 256 bit vector. +// Extract integers from 256 bit vector, inefficient, avoid if possible. 
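+// If all the elements are needed it is usually cheaper to store the vector
+// to an aligned buffer and read the integers back from memory, e.g. a
+// sketch ( t is a hypothetical buffer ):
+//    uint64_t t[4] __attribute__ ((aligned (32)));
+//    _mm256_store_si256( (__m256i*)t, src );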
 //
 // Vector size conversion.
 //
@@ -139,11 +136,11 @@ static inline __m256i m256_neg1_fn()
 #define mm128_extr_lo128_256( a ) _mm256_castsi256_si128( a )
 #define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 )

-// Extract 4 u64 from 256 bit vector.
+// Extract integers from a 256 bit vector; inefficient, avoid if possible.
 #define mm256_extr_4x64( a0, a1, a2, a3, src ) \
 do { \
   __m128i hi = _mm256_extracti128_si256( src, 1 ); \
-  a0 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 0 ); \
+  a0 = mm256_mov256_64( src ); \
   a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
   a2 = _mm_extract_epi64( hi, 0 ); \
   a3 = _mm_extract_epi64( hi, 1 ); \
 } while(0)

@@ -152,28 +149,43 @@ do { \
 #define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \
 do { \
   __m128i hi = _mm256_extracti128_si256( src, 1 ); \
-  a0 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 0 ); \
+  a0 = mm256_mov256_32( src ); \
   a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \
   a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \
   a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \
-  a4 = _mm_extract_epi32( hi, 0 ); \
+  a4 = mm128_mov128_32( hi ); \
   a5 = _mm_extract_epi32( hi, 1 ); \
   a6 = _mm_extract_epi32( hi, 2 ); \
   a7 = _mm_extract_epi32( hi, 3 ); \
 } while(0)

-// input __m128i, returns __m256i
-// To build a 256 bit vector from 2 128 bit vectors lo must be done first.
-// lo alone leaves hi undefined, hi alone leaves lo unchanged.
-// Both cost one clock while preserving the other half..
-// Insert b into specified half of a leaving other half of a unchanged.
-#define mm256_ins_lo128_256( a, b ) _mm256_inserti128_si256( a, b, 0 )
-#define mm256_ins_hi128_256( a, b ) _mm256_inserti128_si256( a, b, 1 )
-
-
 // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
 #define mm256_concat_128( hi, lo ) \
-   mm256_ins_hi128_256( _mm256_castsi128_si256( lo ), hi )
+   _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
+
+// Move integer to lower bits of vector, upper bits set to zero.
+static inline __m256i mm256_mov64_256( uint64_t n )
+{
+  register __m128i a;
+  asm( "movq %1, %0\n\t"
+       : "=x" (a)
+       : "r"  (n) );
+  return _mm256_castsi128_si256( a );
+}
+
+static inline __m256i mm256_mov32_256( uint32_t n )
+{
+  register __m128i a;
+  asm( "movd %1, %0\n\t"
+       : "=x" (a)
+       : "r"  (n) );
+  return _mm256_castsi128_si256( a );
+}
+
+// Move lo bits of vector to integer, hi bits are truncated.
+#define mm256_mov256_64( a ) mm128_mov128_64( _mm256_castsi256_si128( a ) )
+
+#define mm256_mov256_32( a ) mm128_mov128_32( _mm256_castsi256_si128( a ) )

 // Horizontal vector testing
 #if defined(__AVX2__)
@@ -276,13 +288,13 @@ do { \
 // Memory functions
 // n = number of 256 bit (32 byte) vectors

-static inline void memset_zero_256( __m256i *dst, int n )
+static inline void memset_zero_256( __m256i *dst, const int n )
 {   for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; }

-static inline void memset_256( __m256i *dst, const __m256i a, int n )
+static inline void memset_256( __m256i *dst, const __m256i a, const int n )
 {   for ( int i = 0; i < n; i++ ) dst[i] = a; }

-static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
+static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 {   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

 ///////////////////////////////
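The mm256_mov* helpers above amount to a zeroing scalar-to-vector move plus a cast back down. A minimal round-trip sketch using only standard intrinsics, not part of the patch (assumes AVX and x86-64, where the scalar move zero-extends within the xmm register):

/* check_mov.c - scalar -> low 64 bits of ymm -> scalar round trip.
   Build: gcc -mavx2 check_mov.c */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main()
{
   uint64_t n = 0xdeadbeefcafef00dULL;
   /* mm256_mov64_256 equivalent: movq (here _mm_cvtsi64_si128) zeroes the
      rest of the xmm register; the cast to ymm leaves the upper 128 bits
      formally undefined, exactly as in the patch's asm version. */
   __m256i v = _mm256_castsi128_si256( _mm_cvtsi64_si128( (int64_t)n ) );
   /* mm256_mov256_64 equivalent: low 64 bits back out, rest truncated. */
   uint64_t m = (uint64_t)_mm_cvtsi128_si64( _mm256_castsi256_si128( v ) );
   printf( "%s\n", ( m == n ) ? "round trip ok" : "MISMATCH" );
   return 0;
}

This is why the extract macros above route element 0 through mm256_mov256_64 / mm256_mov256_32: a plain movq/movd is cheaper than pextrq/pextrd with index 0.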
@@ -397,7 +409,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
 #define mm256_rol_1x64( v )    _mm256_permute4x64_epi64( v, 0x93 )

 // A little faster with avx512
-// Rotate 256 bit vector by one 32 bit element. Use 64 bit set, it's faster.
+// Rotate 256 bit vector by one 32 bit element.
 #define mm256_ror_1x32( v ) \
     _mm256_permutevar8x32_epi32( v, \
          m256_const_64( 0x0000000000000007, 0x0000000600000005, \
@@ -455,24 +467,28 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
 #define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 )

 // Rotate each 128 bit lane by one 16 bit element.
-#define mm256_rol1x16_128( v ) \
-        _mm256_shuffle_epi8( v, _mm256_set_epi16( 6,5,4,3,2,1,0,7, \
-                                                  6,5,4,3,2,1,0,7 ) )
-#define mm256_ror1x16_128( v ) \
-        _mm256_shuffle_epi8( v, _mm256_set_epi16( 0,7,6,5,4,3,2,1, \
-                                                  0,7,6,5,4,3,2,1 ) )
+#define mm256_ror1x16_128( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x01000f0e0d0c0b0a, \
+                                               0x0908070605040302, \
+                                               0x01000f0e0d0c0b0a, \
+                                               0x0908070605040302 ) )
+#define mm256_rol1x16_128( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x0d0c0b0a09080706, \
+                                               0x0504030201000f0e, \
+                                               0x0d0c0b0a09080706, \
+                                               0x0504030201000f0e ) )

 // Rotate each 128 bit lane by one byte
-#define mm256_rol1x8_128( v ) \
-        _mm256_shuffle_epi8( v, _mm256_set_epi8(14,13,12,11,10, 9, 8, 7, \
-                                                 6, 5, 4, 3, 2, 1, 0,15, \
-                                                14,13,12,11,10, 9, 8, 7, \
-                                                 6, 5, 4, 3, 2, 1, 0,15 ) )
-#define mm256_ror1x8_128( v ) \
-        _mm256_shuffle_epi8( v, _mm256_set_epi8( 0,15,14,13,12,11,10, 9, \
-                                                 8, 7, 6, 5, 4, 3, 2, 1, \
-                                                 0,15,14,13,12,11,10, 9, \
-                                                 8, 7, 6, 5, 4, 3, 2, 1 ) )
+#define mm256_ror1x8_128( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x000f0e0d0c0b0a09, \
+                                               0x0807060504030201, \
+                                               0x000f0e0d0c0b0a09, \
+                                               0x0807060504030201 ) )
+#define mm256_rol1x8_128( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x0e0d0c0b0a090807, \
+                                               0x060504030201000f, \
+                                               0x0e0d0c0b0a090807, \
+                                               0x060504030201000f ) )

 // Rotate each 128 bit lane by c bytes.
 #define mm256_bror_128( v, c ) \
@@ -485,34 +501,65 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
 // Swap 32 bit elements in each 64 bit lane
 #define mm256_swap32_64( v )   _mm256_shuffle_epi32( v, 0xb1 )

-#define mm256_ror16_64( v ) \
-        _mm256_shuffle_epi8( v, _mm256_set_epi16( 4,7,6,5,0,3,2,1, \
-                                                  4,7,6,5,0,3,2,1 ) )
-#define mm256_rol16_64( v ) \
-        _mm256_shuffle_epi8( v, _mm256_set_epi16( 6,5,4,7,2,1,0,3, \
-                                                  6,5,4,7,2,1,0,3 ) )
+#define mm256_ror1x16_64( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x09080f0e0d0c0b0a, \
+                                               0x0100070605040302, \
+                                               0x09080f0e0d0c0b0a, \
+                                               0x0100070605040302 ) )
+#define mm256_rol1x16_64( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x0d0c0b0a09080f0e, \
+                                               0x0504030201000706, \
+                                               0x0d0c0b0a09080f0e, \
+                                               0x0504030201000706 ) )
+#define mm256_ror1x8_64( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x080f0e0d0c0b0a09, \
+                                               0x0007060504030201, \
+                                               0x080f0e0d0c0b0a09, \
+                                               0x0007060504030201 ) )
+#define mm256_rol1x8_64( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x0e0d0c0b0a09080f, \
+                                               0x0605040302010007, \
+                                               0x0e0d0c0b0a09080f, \
+                                               0x0605040302010007 ) )
+
+#define mm256_ror3x8_64( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x0a09080f0e0d0c0b, \
+                                               0x0201000706050403, \
+                                               0x0a09080f0e0d0c0b, \
+                                               0x0201000706050403 ) )
+#define mm256_rol3x8_64( v ) \
+        _mm256_shuffle_epi8( v, m256_const_64( 0x0c0b0a09080f0e0d, \
+                                               0x0403020100070605, \
+                                               0x0c0b0a09080f0e0d, \
+                                               0x0403020100070605 ) )

 // Swap 16 bit elements in each 32 bit lane
 #define mm256_swap16_32( v ) \
-        _mm256_shuffle_epi8( v, _mm256_set_epi16( 6,7,4,5,2,3,0,1, \
-                                                  6,7,4,5,2,3,0,1 ) )
+        _mm256_shuffle_epi8( v, m256_const_64( 0x0d0c0f0e09080b0a, \
+                                               0x0504070601000302, \
+                                               0x0d0c0f0e09080b0a, \
+                                               0x0504070601000302 ) )
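These pshufb control words are easy to get wrong by hand. Each control byte selects a source byte within its own 128 bit lane, so the two lanes of any per-lane rotate must carry identical constants, and a table can be generated mechanically and checked against the hand-written value. A small scalar sketch, not part of the patch (assumes a little-endian x86 host; checks the mm256_ror3x8_64 table above):

/* check_table.c - derive the "rotate each 64 bit lane right by r bytes"
   control and compare with the hard-coded mm256_ror3x8_64 constant.
   Build: gcc check_table.c */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main()
{
   uint8_t ctl[32];
   const int r = 3;                       /* byte rotate count */
   for ( int i = 0; i < 32; i++ )
      /* i & 8 selects the 64 bit lane within the 128 bit half; the low
         3 bits rotate within that lane. pshufb indexes are relative to
         each 128 bit lane, so bytes 16..31 repeat the 0x00..0x0f range. */
      ctl[i] = (uint8_t)( ( i & 8 ) | ( ( ( i & 7 ) + r ) & 7 ) );

   /* Hard-coded table from mm256_ror3x8_64, in memory (qword) order. */
   const uint64_t tbl[4] = { 0x0201000706050403ULL, 0x0a09080f0e0d0c0bULL,
                             0x0201000706050403ULL, 0x0a09080f0e0d0c0bULL };
   printf( "%s\n", memcmp( ctl, tbl, 32 ) == 0 ? "table ok" : "TABLE WRONG" );
   return 0;
}

The same loop with r = 1 reproduces the ror1x8 tables, and an "( ( i & 7 ) - r ) & 7" offset reproduces the rol variants.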
 //
 // Swap bytes in vector elements, endian bswap.
 #define mm256_bswap_64( v ) \
    _mm256_shuffle_epi8( v, m256_const_64( 0x08090a0b0c0d0e0f, \
-               0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
+                                          0x0001020304050607, \
+                                          0x08090a0b0c0d0e0f, \
+                                          0x0001020304050607 ) )

 #define mm256_bswap_32( v ) \
    _mm256_shuffle_epi8( v, m256_const_64( 0x0c0d0e0f08090a0b, \
-               0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
+                                          0x0405060700010203, \
+                                          0x0c0d0e0f08090a0b, \
+                                          0x0405060700010203 ) )

 #define mm256_bswap_16( v ) \
-   _mm256_shuffle_epi8( v, _mm256_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
-                                             6, 7,  4, 5,  2, 3, 0, 1, \
-                                            14,15, 12,13, 10,11, 8, 9, \
-                                             6, 7,  4, 5,  2, 3, 0, 1 ) )
+   _mm256_shuffle_epi8( v, m256_const_64( 0x0e0f0c0d0a0b0809, \
+                                          0x0607040502030001, \
+                                          0x0e0f0c0d0a0b0809, \
+                                          0x0607040502030001 ) )

 // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
 #define mm256_block_bswap_64( d, s ) do \