From aa47e880d5c0356eaba47ba3f3a8bb36df02ff9e Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 9 Jul 2025 01:32:38 -0400 Subject: [PATCH] v25.5 --- RELEASE_NOTES | 6 + algo/sha/sha256-hash-4way.c | 370 +++++++++++++++--------------------- algo/sha/sha512-hash-4way.c | 33 ---- build-allarch.sh | 58 +++--- clean-all.sh | 2 +- configure | 28 +-- configure.ac | 2 +- configure~ | 20 +- cpu-miner.c | 4 +- simd-utils/simd-256.h | 4 +- simd-utils/simd-512.h | 8 +- simd-utils/simd-neon.h | 5 +- 12 files changed, 221 insertions(+), 319 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index ae8ff13..1c189d3 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,6 +75,12 @@ If not what makes it happen or not happen? Change Log ---------- +v25.5 + +x86_64: Fixed and insidious bug in sha256 early rejection optimization for AVX2 & AVX512. +x86_64: Faster sha256d, sha256dt for AVX2 & AVX512. +Other small bug fixes. + v25.4 x86_64: improved handling of vector constants used for byte permutations. 
diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index a41f6e5..77b1241 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -441,57 +441,6 @@ void sha256_4x32_full( void *dst, const void *data, size_t len ) W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \ W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); -#if defined(VL256) -// AVX512 or AVX10-256 - -#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca ) - -#define MAJx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xe8 ) - -#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \ -do { \ - __m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \ - W[ i ] ); \ - __m256i T1 = BSG2_1x( E ); \ - __m256i T2 = BSG2_0x( A ); \ - T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ - T1 = _mm256_add_epi32( T1, H ); \ - T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ - T1 = _mm256_add_epi32( T1, T0 ); \ - D = _mm256_add_epi32( D, T1 ); \ - H = _mm256_add_epi32( T1, T2 ); \ -} while (0) - -#define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ - SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \ - SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \ - SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, j ); \ - SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, j ); \ - SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, j ); \ - SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, j ); \ - SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, j ); \ - SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, j ); \ - SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, j ); \ - SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, j ); \ - SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, j ); \ - SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, j ); \ - SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, j ); \ - SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, j ); \ - SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \ - SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j ); - -// Not used with AVX512, needed to 
satisfy the compiler -#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ -{ \ - __m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \ - v256_32( K256[(i)+(j)] ) ); \ - __m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ - D = _mm256_add_epi32( D, T1 ); \ - H = _mm256_add_epi32( T1, T2 ); \ -} - -#else // AVX2 - #define CHx(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) @@ -503,61 +452,58 @@ do { \ #define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ { \ - __m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \ + H = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \ v256_32( K256[(i)+(j)] ) ); \ - __m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ + __m256i T = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \ Y_xor_Z = X_xor_Y; \ - D = _mm256_add_epi32( D, T1 ); \ - H = _mm256_add_epi32( T1, T2 ); \ + D = _mm256_add_epi32( D, H ); \ + H = _mm256_add_epi32( H, T ); \ } #define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \ -do { \ - __m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \ - __m256i T1 = BSG2_1x( E ); \ +{ \ + __m256i T1 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \ + H = _mm256_add_epi32( H, BSG2_1x( E ) ); \ __m256i T2 = BSG2_0x( A ); \ - T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ - T1 = _mm256_add_epi32( T1, H ); \ + T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \ T2 = _mm256_add_epi32( T2, MAJx( A, B, C ) ); \ - T1 = _mm256_add_epi32( T1, T0 ); \ + H = _mm256_add_epi32( H, T1 ); \ Y_xor_Z = X_xor_Y; \ - D = _mm256_add_epi32( D, T1 ); \ - H = _mm256_add_epi32( T1, T2 ); \ -} while (0) + D = _mm256_add_epi32( D, H ); \ + H = _mm256_add_epi32( H, T2 ); \ +} // read Y_xor_Z, update X_xor_Y #define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) -// start with toc initialized to y^z: toc = B ^ C +// start with toc initialized to y^z, toc = B ^ C for first 
ound. // First round reads toc as Y_xor_Z and saves X_xor_Y as tic. // Second round reads tic as Y_xor_Z and saves X_xor_Y as toc. #define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \ -do { \ - __m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \ +{ \ + __m256i T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \ W[ i0 ] ); \ - __m256i T1 = BSG2_1x( E ); \ + H = _mm256_add_epi32( H, BSG2_1x( E ) ); \ __m256i T2 = BSG2_0x( A ); \ - T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \ - T1 = _mm256_add_epi32( T1, H ); \ + T1 = _mm256_add_epi32( T1, CHx( E, F, G ) ); \ T2 = _mm256_add_epi32( T2, MAJ_2step( A, B, C, tic, toc ) ); \ - T1 = _mm256_add_epi32( T1, T0 ); \ - D = _mm256_add_epi32( D, T1 ); \ - H = _mm256_add_epi32( T1, T2 ); \ + H = _mm256_add_epi32( H, T1 ); \ + D = _mm256_add_epi32( D, H ); \ + H = _mm256_add_epi32( H, T2 ); \ \ - T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \ + T1 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \ W[ (i1) ] ); \ - T1 = BSG2_1x( D ); \ + G = _mm256_add_epi32( G, BSG2_1x( D ) ); \ T2 = BSG2_0x( H ); \ - T0 = _mm256_add_epi32( T0, CHx( D, E, F ) ); \ - T1 = _mm256_add_epi32( T1, G ); \ + T1 = _mm256_add_epi32( T1, CHx( D, E, F ) ); \ T2 = _mm256_add_epi32( T2, MAJ_2step( H, A, B, toc, tic ) ); \ - T1 = _mm256_add_epi32( T1, T0 ); \ - C = _mm256_add_epi32( C, T1 ); \ - G = _mm256_add_epi32( T1, T2 ); \ -} while (0) + G = _mm256_add_epi32( G, T1 ); \ + C = _mm256_add_epi32( C, G ); \ + G = _mm256_add_epi32( G, T2 ); \ +} #define SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ { \ @@ -572,8 +518,6 @@ do { \ SHA256_8WAY_2ROUNDS( C, D, E, F, G, H, A, B, 14, 15, j ); \ } -#endif // AVX512VL else AVX2 - static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, const __m256i *in ) \ { @@ -650,9 +594,7 @@ void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X, G = _mm256_load_si256( state_in + 6 ); H = _mm256_load_si256( state_in + 7 ); -#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = 
_mm256_xor_si256( B, C ); -#endif SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 ); SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); @@ -692,9 +634,7 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data, G = _mm256_load_si256( state_mid + 6 ); H = _mm256_load_si256( state_mid + 7 ); -#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G ); -#endif // round 3 part 2, add nonces A = _mm256_add_epi32( A, W[3] ); @@ -779,10 +719,10 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data, int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data, const __m256i *state_in, const uint32_t *target ) { - __m256i A, B, C, D, E, F, G, H, T0, T1, T2; + __m256i A, B, C, D, E, F, G, H, G57, H56; __m256i vmask, targ, hash; __m256i W[16]; memcpy_256( W, data, 16 ); - uint8_t flip, t6_mask; + uint8_t flip, t6_mask, t7_mask; A = _mm256_load_si256( state_in ); B = _mm256_load_si256( state_in+1 ); @@ -793,12 +733,10 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data, G = _mm256_load_si256( state_in+6 ); H = _mm256_load_si256( state_in+7 ); - const __m256i IV7 = H; - const __m256i IV6 = G; + const __m256i istate6 = G; + const __m256i istate7 = H; -#if !defined(VL256) __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); -#endif // rounds 0 to 16, ignore zero padding W[9..14] SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 ); @@ -841,11 +779,9 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data, W[11] = SHA256_8WAY_MEXP( W[ 9], W[ 4], W[12], W[11] ); W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] ); -#if !defined(VL256) Y_xor_Z = _mm256_xor_si256( B, C ); -#endif - // rounds 48 to 57 + // Rounds 48 to 55 SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 ); SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 ); SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 ); @@ -854,77 +790,83 @@ int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data, 
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 ); SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 ); SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 ); - SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 ); - SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 ); - // round 58 to 60 part 1 - T0 = _mm256_add_epi32( v256_32( K256[58] ), + // Round 56 + H = _mm256_add_epi32( v256_32( K256[56] ), + mm256_add4_32( BSG2_1x( E ), CHx( E, F, G ), W[ 8], H ) ); + D = _mm256_add_epi32( D, H ); + H56 = _mm256_add_epi32( H, _mm256_add_epi32( BSG2_0x( A ), + MAJx( A, B, C ) ) ); + Y_xor_Z = X_xor_Y; + + // Rounds 57 to 60 part 1 + G = _mm256_add_epi32( v256_32( K256[57] ), + mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ), W[ 9], G ) ); + C = _mm256_add_epi32( C, G ); + G57 = _mm256_add_epi32( G, MAJx( H56, A, B ) ); + + F = _mm256_add_epi32( v256_32( K256[58] ), mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) ); - B = _mm256_add_epi32( B, T0 ); + B = _mm256_add_epi32( B, F ); - T1 = _mm256_add_epi32( v256_32( K256[59] ), + E = _mm256_add_epi32( v256_32( K256[59] ), mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) ); - A = _mm256_add_epi32( A, T1 ); + A = _mm256_add_epi32( A, E ); - T2 = _mm256_add_epi32( v256_32( K256[60] ), + D = _mm256_add_epi32( v256_32( K256[60] ), mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) ); - H = _mm256_add_epi32( H, T2 ); + H = _mm256_add_epi32( H56, D ); // Got H, test it. + hash = mm256_bswap_32( _mm256_add_epi32( H, istate7 ) ); targ = v256_32( target[7] ); - hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) ); - if ( target[7] ) - { - flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); - if ( likely( 0xff == ( flip ^ - mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) )) - return 0; - } + // A simple unsigned LE test is complicated by the lack of a cmple + // instruction, and lack of unsigned compares in AVX2. + flip = ( (int)target[7] < 0 ? 
-1 : 0 ) ^ mm256_movmask_32( hash ); + if ( likely( 0xff == ( t7_mask = ( flip ^ + mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))) + return 0; t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) ); - // round 58 part 2 - F = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( G ), - MAJx( G, H, A ) ) ); - // round 61 part 1 - W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] ); - T0 = _mm256_add_epi32( v256_32( K256[61] ), - mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) ); - G = _mm256_add_epi32( G, T0 ); + // Round 57 part 2 + G57 = _mm256_add_epi32( G57, BSG2_0x( H56 ) ); + Y_xor_Z = X_xor_Y; - if ( t6_mask ) + // Round 61 part 1 + W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] ); + C = _mm256_add_epi32( v256_32( K256[61] ), + mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) ); + G = _mm256_add_epi32( G57, C ); + + if ( t6_mask == (0xff & ~t7_mask ) ) { // Testing H was inconclusive: hash7 == target7, need to test G targ = _mm256_and_si256( vmask, v256_32( target[6] ) ); - hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) ); - - if ( likely( 0 == ( t6_mask & mm256_movmask_32( - _mm256_cmpeq_epi32( hash, targ ) ) ) )) - { - flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); - if ( likely( 0 != ( t6_mask & ( flip ^ + hash = mm256_bswap_32( _mm256_add_epi32( G, istate6 ) ); + flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash ); + if ( likely( 0 != ( t6_mask & ( flip ^ mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) )) - return 0; - if ( likely( ( target[6] == 0x80000000 ) - && ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32( - hash, _mm256_xor_si256( hash, hash ) ) ) ) ) )) - return 0; - } -// else inconclusive, testing targ5 isn't practical, fininsh hashing + return 0; } -// At this point either the hash will be good or the test was inconclusive. 
-// If the latter it's probably a high target difficulty with a nearly equal -// high difficulty hash that has a good chance of being good. + // Rounds 58 to 61 part 2 + F = _mm256_add_epi32( F, _mm256_add_epi32( BSG2_0x( G57 ), + MAJx( G57, H, A ) ) ); + Y_xor_Z = X_xor_Y; - // rounds 59 to 61 part 2 - E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ), - MAJx( F, G, H ) ) ); - D = _mm256_add_epi32( T2, _mm256_add_epi32( BSG2_0x( E ), - MAJx( E, F, G ) ) ); - C = _mm256_add_epi32( T0, _mm256_add_epi32( BSG2_0x( D ), - MAJx( D, E, F ) ) ); + E = _mm256_add_epi32( E, _mm256_add_epi32( BSG2_0x( F ), + MAJx( F, G57, H ) ) ); + Y_xor_Z = X_xor_Y; - // rounds 62 & 63 + D = _mm256_add_epi32( D, _mm256_add_epi32( BSG2_0x( E ), + MAJx( E, F, G57 ) ) ); + Y_xor_Z = X_xor_Y; + + C = _mm256_add_epi32( C, _mm256_add_epi32( BSG2_0x( D ), + MAJx( D, E, F ) ) ); + Y_xor_Z = X_xor_Y; + + // Rounds 62 & 63 W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); @@ -1077,40 +1019,26 @@ void sha256_8x32_full( void *dst, const void *data, size_t len ) W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); #define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \ -do { \ - __m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \ - __m512i T1 = BSG2_1x16( E ); \ +{ \ + __m512i T1 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \ + H = _mm512_add_epi32( H, BSG2_1x16( E ) ); \ __m512i T2 = BSG2_0x16( A ); \ - T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \ - T1 = _mm512_add_epi32( T1, H ); \ + T1 = _mm512_add_epi32( T1, CHx16( E, F, G ) ); \ T2 = _mm512_add_epi32( T2, MAJx16( A, B, C ) ); \ - T1 = _mm512_add_epi32( T1, T0 ); \ - D = _mm512_add_epi32( D, T1 ); \ - H = _mm512_add_epi32( T1, T2 ); \ -} while (0) + H = _mm512_add_epi32( H, T1 ); \ + D = _mm512_add_epi32( D, H ); \ + H = _mm512_add_epi32( H, T2 ); \ +} #define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \ { \ - __m512i T1 = 
mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \ + H = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \ v512_32( K256[(i)+(j)] ) ); \ - __m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ - D = _mm512_add_epi32( D, T1 ); \ - H = _mm512_add_epi32( T1, T2 ); \ + __m512i T = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ + D = _mm512_add_epi32( D, H ); \ + H = _mm512_add_epi32( H, T ); \ } -/* -#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \ -do { \ - __m512i T1, T2; \ - __m512i K = v512_32( K256[( (j)+(i) )] ); \ - T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \ - K, W[i] ) ); \ - T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ - D = _mm512_add_epi32( D, T1 ); \ - H = _mm512_add_epi32( T1, T2 ); \ -} while (0) -*/ - #define SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, j ); \ SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, j ); \ @@ -1332,11 +1260,10 @@ void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data, int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data, const __m512i *state_in, const uint32_t *target ) { - __m512i A, B, C, D, E, F, G, H, hash, targ; - __m512i T0, T1, T2; + __m512i A, B, C, D, E, F, G, H, hash, targ, G57, H56; __m512i W[16]; memcpy_512( W, data, 16 ); - __mmask16 t6_mask; - + __mmask16 mask; + A = _mm512_load_si512( state_in ); B = _mm512_load_si512( state_in+1 ); C = _mm512_load_si512( state_in+2 ); @@ -1346,9 +1273,9 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data, G = _mm512_load_si512( state_in+6 ); H = _mm512_load_si512( state_in+7 ); - const __m512i IV6 = G; - const __m512i IV7 = H; - + const __m512i istate6 = G; + const __m512i istate7 = H; + // rounds 0 to 8 SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 ); SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 ); @@ -1419,7 +1346,7 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const 
__m512i *data, W[11] = SHA256_16WAY_MEXP( W[ 9], W[ 4], W[12], W[11] ); W[12] = SHA256_16WAY_MEXP( W[10], W[ 5], W[13], W[12] ); - // Rounds 48 to 57 + // Rounds 48 to 55 SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 ); SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 ); SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 ); @@ -1428,62 +1355,67 @@ int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data, SHA256_16WAY_ROUND( D, E, F, G, H, A, B, C, 5, 48 ); SHA256_16WAY_ROUND( C, D, E, F, G, H, A, B, 6, 48 ); SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 7, 48 ); - SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 48 ); - SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 ); - // rounds 58 to 60 part 1 - T0 = _mm512_add_epi32( v512_32( K256[58] ), - mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) ); - B = _mm512_add_epi32( B, T0 ); + // Round 56 + H = _mm512_add_epi32( v512_32( K256[56] ), + mm512_add4_32( BSG2_1x16( E ), CHx16( E, F, G ), W[ 8], H ) ); + D = _mm512_add_epi32( D, H ); + H56 = _mm512_add_epi32( H, _mm512_add_epi32( BSG2_0x16( A ), + MAJx16( A, B, C ) ) ); - T1 = _mm512_add_epi32( v512_32( K256[59] ), + // Rounds 57 to 60 part 1 + G = _mm512_add_epi32( v512_32( K256[57] ), + mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) ); + C = _mm512_add_epi32( C, G ); + G57 = _mm512_add_epi32( G, MAJx16( H56, A, B ) ); + + F = _mm512_add_epi32( v512_32( K256[58] ), + mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) ); + B = _mm512_add_epi32( B, F ); + + E = _mm512_add_epi32( v512_32( K256[59] ), mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) ); - A = _mm512_add_epi32( A, T1 ); + A = _mm512_add_epi32( A, E ); - T2 = _mm512_add_epi32( v512_32( K256[60] ), + D = _mm512_add_epi32( v512_32( K256[60] ), mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) ); - H = _mm512_add_epi32( H, T2 ); + H = _mm512_add_epi32( H56, D ); - // got H, test it against target[7] - hash = mm512_bswap_32( 
_mm512_add_epi32( H , IV7 ) ); + // got final H, test it against target[7] + hash = mm512_bswap_32( _mm512_add_epi32( H , istate7 ) ); targ = v512_32( target[7] ); - if ( target[7] ) - if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) )) + if ( likely( 0 == ( mask = _mm512_cmple_epu32_mask( hash, targ ) ) )) return 0; - t6_mask = _mm512_cmpeq_epi32_mask( hash, targ ); - // round 58 part 2 - F = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( G ), - MAJx16( G, H, A ) ) ); - - // round 61 part 1 + // Round 57 part 2 + G57 = _mm512_add_epi32( G57, BSG2_0x16( H56 ) ); + + // Round 61 part 1 W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] ); - T0 = _mm512_add_epi32( v512_32( K256[61] ), + C = _mm512_add_epi32( v512_32( K256[61] ), mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) ); - G = _mm512_add_epi32( G, T0 ); + G = _mm512_add_epi32( G57, C ); - // got G, test it against target[6] if indicated - if ( (uint16_t)t6_mask ) + // got final G, test it against target[6] if indicated. 
+ if ( mask == _mm512_cmpeq_epi32_mask( hash, targ ) ) { - hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) ); + hash = mm512_bswap_32( _mm512_add_epi32( G, istate6 ) ); targ = v512_32( target[6] ); - if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) )) + if ( likely( 0 == _mm512_mask_cmple_epu32_mask( mask, hash, targ ) )) return 0; } - // round 59 part 2 - E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16( F ), - MAJx16( F, G, H ) ) ); - - // round 60 part 2 - D = _mm512_add_epi32( T2, _mm512_add_epi32( BSG2_0x16( E ), - MAJx16( E, F, G ) ) ); - - // round 61 part 2 - C = _mm512_add_epi32( T0, _mm512_add_epi32( BSG2_0x16( D ), - MAJx16( D, E, F ) ) ); + // Round 58 to 61 part 2 + F = _mm512_add_epi32( F, _mm512_add_epi32( BSG2_0x16( G57 ), + MAJx16( G57, H, A ) ) ); + E = _mm512_add_epi32( E, _mm512_add_epi32( BSG2_0x16( F ), + MAJx16( F, G57, H ) ) ); + D = _mm512_add_epi32( D, _mm512_add_epi32( BSG2_0x16( E ), + MAJx16( E, F, G57 ) ) ); + C = _mm512_add_epi32( C, _mm512_add_epi32( BSG2_0x16( D ), + MAJx16( D, E, F ) ) ); - // rounds 62, 63 + // Rounds 62, 63 W[14] = SHA256_16WAY_MEXP( W[12], W[ 7], W[15], W[14] ); W[15] = SHA256_16WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 10fefa8..ce134d6 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -783,29 +783,6 @@ void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data, mm256_ror_64( x, 61 ), \ _mm256_srli_epi64( x, 6 ) ) -#if defined(VL256) -// 4 way is not used whith AVX512 but will be whith AVX10_256 when it -// becomes available. 
- -#define CH( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xca ) - -#define MAJ( X, Y, Z ) _mm256_ternarylogic_epi64( X, Y, Z, 0xe8 ) - -#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \ -do { \ - __m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \ - __m256i T1 = BSG5_1( E ); \ - __m256i T2 = BSG5_0( A ); \ - T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \ - T1 = _mm256_add_epi64( T1, H ); \ - T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \ - T1 = _mm256_add_epi64( T1, T0 ); \ - D = _mm256_add_epi64( D, T1 ); \ - H = _mm256_add_epi64( T1, T2 ); \ -} while (0) - -#else // AVX2 only - #define CH(X, Y, Z) \ _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) @@ -827,19 +804,12 @@ do { \ H = _mm256_add_epi64( T1, T2 ); \ } while (0) -#endif // AVX512VL AVX10_256 - static void sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] ) { int i; register __m256i A, B, C, D, E, F, G, H; - -#if !defined(VL256) -// Disable for AVX10_256 __m256i X_xor_Y, Y_xor_Z; -#endif - __m256i W[80]; mm256_block_bswap_64( W , in ); @@ -872,10 +842,7 @@ sha512_4x64_round( sha512_4x64_context *ctx, __m256i *in, __m256i r[8] ) H = v256_64( 0x5BE0CD19137E2179 ); } -#if !defined(VL256) -// Disable for AVX10_256 Y_xor_Z = _mm256_xor_si256( B, C ); -#endif for ( i = 0; i < 80; i += 8 ) { diff --git a/build-allarch.sh b/build-allarch.sh index dba2afc..7495ea6 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. 
-rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null +./clean-all.sh # AVX512 SHA VAES: Intel Core Icelake, Rocketlake make distclean || echo clean @@ -18,31 +18,31 @@ strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes # Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12 -#make clean || echo clean -#rm -f config.status -#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl -#make -j $(nproc) -#strip -s cpuminer -#mv cpuminer cpuminer-alderlake +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-alderlake # Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14 # Arrowlake-s includes SHA512, Arrowlake does not? -#make clean || echo clean -#rm -f config.status -#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl -#make -j $(nproc) -#strip -s cpuminer -#mv cpuminer cpuminer-arrowlake-s +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-arrowlake-s # Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14 # Granitrapids does not build with AVX10, SHA512 or APX. # wait for Diamondrapids & gcc-15. 
-#make clean || echo clean -#rm -f config.status -#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl -#make -j $(nproc) -#strip -s cpuminer -#mv cpuminer cpuminer-graniterapids +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-graniterapids # SHA512 AVX10.1 #make clean || echo clean @@ -69,20 +69,20 @@ mv cpuminer cpuminer-avx512-sha-vaes #mv cpuminer cpuminer-diamondrapids # Zen5: AVX512 SHA VAES, requires gcc-14. -#make clean || echo clean -#rm -f config.status -#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl -#make -j $(nproc) -#strip -s cpuminer -#mv cpuminer cpuminer-zen5 +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-zen5 # Zen4: AVX512 SHA VAES make clean || echo clean rm -f config.status # Zen4: AVX512, SHA, VAES, needs gcc-12.3. -#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl +CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl # Inclomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer. 
-CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl +#CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall" ./configure --with-curl make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-zen4 @@ -115,8 +115,8 @@ mv cpuminer cpuminer-avx2-sha-vaes # AVX2 SHA AES: AMD Zen1 make clean || echo done rm -f config.status -#CFLAGS="-O3 -march=znver1 -maes -Wall" ./configure --with-curl -CFLAGS="-O3 -maes -mavx2 -msha -Wall" ./configure --with-curl +CFLAGS="-O3 -march=znver1 -maes -Wall" ./configure --with-curl +#CFLAGS="-O3 -maes -mavx2 -msha -Wall" ./configure --with-curl make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx2-sha diff --git a/clean-all.sh b/clean-all.sh index 1820aff..fd2f57b 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,7 +2,7 @@ # # make clean and rm all the targetted executables. 
-rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null +rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null diff --git a/configure b/configure index ba07e11..e60f1b4 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.4. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.5. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='25.4' -PACKAGE_STRING='cpuminer-opt 25.4' +PACKAGE_VERSION='25.5' +PACKAGE_STRING='cpuminer-opt 25.5' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. 
cat <<_ACEOF -\`configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1431,7 +1431,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 25.4:";; + short | recursive ) echo "Configuration of cpuminer-opt 25.5:";; esac cat <<\_ACEOF @@ -1536,7 +1536,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 25.4 +cpuminer-opt configure 25.5 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 25.4, which was +It was created by cpuminer-opt $as_me 25.5, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3591,7 +3591,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='25.4' + VERSION='25.5' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -5808,11 +5808,11 @@ if test x$ac_prog_cxx_stdcxx = xno then : { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5 printf %s "checking for $CXX option to enable C++11 features... " >&6; } -if test ${ac_cv_prog_cxx_11+y} +if test ${ac_cv_prog_cxx_cxx11+y} then : printf %s "(cached) " >&6 else $as_nop - ac_cv_prog_cxx_11=no + ac_cv_prog_cxx_cxx11=no ac_save_CXX=$CXX cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -5854,11 +5854,11 @@ if test x$ac_prog_cxx_stdcxx = xno then : { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5 printf %s "checking for $CXX option to enable C++98 features... 
" >&6; } -if test ${ac_cv_prog_cxx_98+y} +if test ${ac_cv_prog_cxx_cxx98+y} then : printf %s "(cached) " >&6 else $as_nop - ac_cv_prog_cxx_98=no + ac_cv_prog_cxx_cxx98=no ac_save_CXX=$CXX cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 25.4, which was +This file was extended by cpuminer-opt $as_me 25.5, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 25.4 +cpuminer-opt config.status 25.5 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index bcd8c00..87e84ed 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [25.4]) +AC_INIT([cpuminer-opt], [25.5]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index 47fb362..0be54fb 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.4. +# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.5. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation, @@ -601,8 +601,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='25.4' -PACKAGE_STRING='cpuminer-opt 25.4' +PACKAGE_VERSION='25.5' +PACKAGE_STRING='cpuminer-opt 25.5' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -'configure' configures cpuminer-opt 25.4 to adapt to many kinds of systems. +'configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1424,7 +1424,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 25.4:";; + short | recursive ) echo "Configuration of cpuminer-opt 25.5:";; esac cat <<\_ACEOF @@ -1528,7 +1528,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 25.4 +cpuminer-opt configure 25.5 generated by GNU Autoconf 2.72 Copyright (C) 2023 Free Software Foundation, Inc. @@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 25.4, which was +It was created by cpuminer-opt $as_me 25.5, which was generated by GNU Autoconf 2.72. Invocation command line was $ $0$ac_configure_args_raw @@ -3764,7 +3764,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='25.4' + VERSION='25.5' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7450,7 +7450,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 25.4, which was +This file was extended by cpuminer-opt $as_me 25.5, which was generated by GNU Autoconf 2.72. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7518,7 +7518,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 25.4 +cpuminer-opt config.status 25.5 configured by $0, generated by GNU Autoconf 2.72, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 0937062..5adfd72 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3760,10 +3760,10 @@ int main(int argc, char *argv[]) #if defined(_WIN32_WINNT) if (opt_debug) - applog( LOG_INFO, "_WIN232_WINNT = 0x%04x", _WIN32_WINNT ); + applog( LOG_INFO, "_WIN32_WINNT = 0x%04x", _WIN32_WINNT ); #else if (opt_debug) - applog( LOG_INFO, "_WIN232_WINNT undefined." ); + applog( LOG_INFO, "_WIN32_WINNT undefined." ); #endif #if defined(WINDOWS_CPU_GROUPS_ENABLED) if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) ) diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 56d78ab..ddfbc6b 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -217,7 +217,9 @@ static inline __m256i mm256_not( const __m256i v ) // Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask. // Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements. // Effectively a sign test. - +// The functions return int which can promote small integers to int when used +// in an expression. Users should mask the slack bits strategically to maintain +// data integrity. #define mm256_movmask_64( v ) \ _mm256_movemask_pd( _mm256_castsi256_pd( v ) ) diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 629a41a..fef0197 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -14,12 +14,6 @@ // vectors. It is therefore not technically required for any 512 bit vector // utilities defined below. 
-// if avx10 // avx512 is always set -// if evex512: yes -// else if avx512 : yes // avx512 is set but not avx10 -// else : no // avx512 not set or avx10.1 is set without evex512 - - #if defined(SIMD512) // AVX512 intrinsics have a few changes from previous conventions. @@ -57,7 +51,7 @@ // - if an argument is to referenced multiple times a C inline function // should be used instead of a macro to prevent an expression argument // from being evaluated multiple times (wasteful) or produces side -// effects (very bad). +// effects (very bad). // // There are 2 areas where overhead is a major concern: constants and // permutations. diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h index 7f15323..7063036 100644 --- a/simd-utils/simd-neon.h +++ b/simd-utils/simd-neon.h @@ -4,9 +4,10 @@ #if defined(__aarch64__) && defined(__ARM_NEON) // Targeted functions supporting NEON SIMD 128 & 64 bit vectors. -// Element size matters! // -// Intel naming is generally used. +// Intel style naming is generally used, however, this is not an attempt to emulate Intel +// instructions. It's focused on the functions used in this program and the best way +// to implement them with NEON. // // Some advanced logical operations that require SHA3. Prior to GCC-13 // they also require armv8.2