From 8f2f9ec3e956ebfeb2aa335c3965f9bdb148458d Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sat, 15 Nov 2025 10:44:32 -0500 Subject: [PATCH] v25.7 --- RELEASE_NOTES | 8 + algo/argon2d/blake2/blamka-round-opt.h | 252 ++++++++++++++----------- algo/bmw/bmw512-hash-4way.c | 16 +- algo/hamsi/hamsi-hash-4way.c | 37 ++-- algo/luffa/luffa-hash-2way.c | 8 +- algo/luffa/luffa_for_sse2.c | 6 +- algo/sha/sha512-hash-4way.c | 2 +- algo/sha/sha512-hash.h | 2 +- algo/shabal/shabal-hash-4way.c | 4 +- algo/simd/simd-hash-2way.c | 91 ++++++--- algo/swifftx/swifftx.c | 19 +- armbuild-all.sh | 25 ++- build-allarch.sh | 33 ++-- clean-all.sh | 4 +- configure | 20 +- configure.ac | 2 +- configure~ | 20 +- cpu-miner.c | 167 +++++++--------- miner.h | 2 + simd-utils/simd-128.h | 4 +- simd-utils/simd-256.h | 4 +- simd-utils/simd-512.h | 57 ++++-- simd-utils/simd-neon.h | 48 +---- sysinfos.c | 42 ++--- util.c | 49 ++--- verthash-help.txt | 4 +- 26 files changed, 474 insertions(+), 452 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index ed04fe0..ea833d8 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,6 +75,14 @@ If not what makes it happen or not happen? Change Log ---------- +v25.7 + +Fixed a bug calculating TTF longer than 1 year. +Faster argon2d. +Faster hamsi AVX512. +Faster switfftx AVX2. +Other small fixes and improvements. + v25.6 Added argon2d1000, argon2d16000 algos. diff --git a/algo/argon2d/blake2/blamka-round-opt.h b/algo/argon2d/blake2/blamka-round-opt.h index 0b4cd78..05b153b 100644 --- a/algo/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2d/blake2/blamka-round-opt.h @@ -66,82 +66,60 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) #if defined(__SSSE3__) || defined(__ARM_NEON) -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - v128_t t0 = v128_alignr8(B1, B0, 8); \ - v128_t t1 = v128_alignr8(B0, B1, 8); \ - B0 = t0; \ - B1 = t1; \ - \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - \ - t0 = v128_alignr8(D1, D0, 8); \ - t1 = v128_alignr8(D0, D1, 8); \ - D0 = t1; \ - D1 = t0; \ - } while ((void)0, 0) +#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \ +{ \ + v128_t t = v128_alignr8( B1, B0, 8 ); \ + B1 = v128_alignr8( B0, B1, 8 ); \ + B0 = t; \ + t = v128_alignr8( D1, D0, 8 ); \ + D0 = v128_alignr8( D0, D1, 8 ); \ + D1 = t; \ +} -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - v128_t t0 = v128_alignr8(B0, B1, 8); \ - v128_t t1 = v128_alignr8(B1, B0, 8); \ - B0 = t0; \ - B1 = t1; \ - \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - \ - t0 = v128_alignr8(D0, D1, 8); \ - t1 = v128_alignr8(D1, D0, 8); \ - D0 = t1; \ - D1 = t0; \ - } while ((void)0, 0) +#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \ +{ \ + v128_t t = v128_alignr8( B0, B1, 8 ); \ + B1 = v128_alignr8( B1, B0, 8 ); \ + B0 = t; \ + t = v128_alignr8( D0, D1, 8 ); \ + D0 = v128_alignr8( D1, D0, 8 ); \ + D1 = t; \ +} #else /* SSE2 */ -#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - v128_t t0 = D0; \ - v128_t t1 = B0; \ - D0 = C0; \ - C0 = C1; \ - C1 = D0; \ - D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \ - D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \ - B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \ - B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \ - } while ((void)0, 0) +#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \ +{ \ + v128_t t = D0; \ + D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \ + D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \ + t = B0; \ + B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \ + B1 = 
v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \ +} + +#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \ +{ \ + v128_t t = B0; \ + B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \ + B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \ + t = D0; \ + D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \ + D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \ +} -#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ - do { \ - v128_t t0, t1; \ - t0 = C0; \ - C0 = C1; \ - C1 = t0; \ - t0 = B0; \ - t1 = D0; \ - B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \ - B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \ - D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \ - D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \ - } while ((void)0, 0) #endif -#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ - do { \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - G1(A0, B0, C0, D0, A1, B1, C1, D1); \ - G2(A0, B0, C0, D0, A1, B1, C1, D1); \ - \ - UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ - } while ((void)0, 0) +#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \ +{ \ + G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \ + G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \ + DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \ + G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \ + G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \ + UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \ +} + #else /* __AVX2__ */ #include @@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ - \ B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ @@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) #define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ do { \ - __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ - __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ - B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - tmp1 = C0; \ - B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - C0 = C1; \ - tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \ - C1 = tmp1; \ + __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \ + __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \ + B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \ + B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \ tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ - D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \ + D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \ + D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \ } while(0); #define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ - \ B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ @@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) do { \ __m256i tmp1 = 
_mm256_blend_epi32(B0, B1, 0xCC); \ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ - B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ - tmp1 = C0; \ - B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - C0 = C1; \ + B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \ + B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \ tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ - C1 = tmp1; \ tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ - D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ - D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ + D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \ + D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \ } while((void)0, 0); #define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ do{ \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ - \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ } while((void)0, 0); @@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) do{ \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ - G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ - \ + G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \ + G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \ UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ } while((void)0, 0); @@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) #include +/* static inline __m512i muladd(__m512i x, __m512i y) { __m512i z = _mm512_mul_epu32(x, y); return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); } +*/ +#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \ +{ \ + __m512i z0, z1; \ + z0 = _mm512_mul_epu32( A0, B0 ); \ + z1 = _mm512_mul_epu32( A1, B1 ); \ + A0 = _mm512_add_epi64( A0, B0 ); \ + A1 = _mm512_add_epi64( A1, B1 ); \ + z0 = _mm512_add_epi64( z0, z0 ); \ + z1 = _mm512_add_epi64( z1, z1 ); \ + A0 = _mm512_add_epi64( A0, z0 ); \ + A1 = _mm512_add_epi64( A1, z1 ); \ + D0 = _mm512_xor_si512(D0, A0); \ + D1 = _mm512_xor_si512(D1, A1); \ + D0 = _mm512_ror_epi64(D0, 32); \ + D1 = _mm512_ror_epi64(D1, 32); \ + z0 = _mm512_mul_epu32( C0, D0 ); \ + z1 = _mm512_mul_epu32( C1, D1 ); \ + C0 = _mm512_add_epi64( C0, D0 ); \ + C1 = _mm512_add_epi64( C1, D1 ); \ + z0 = _mm512_add_epi64( z0, z0 ); \ + z1 = _mm512_add_epi64( z1, z1 ); \ + C0 = _mm512_add_epi64( C0, z0 ); \ + C1 = _mm512_add_epi64( C1, z1 ); \ + B0 = _mm512_xor_si512(B0, C0); \ + B1 = _mm512_xor_si512(B1, C1); \ + B0 = _mm512_ror_epi64(B0, 24); \ + B1 = _mm512_ror_epi64(B1, 24); \ +} + +#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \ +{ \ + __m512i z0, z1; \ + z0 = _mm512_mul_epu32( A0, B0 ); \ + z1 = _mm512_mul_epu32( A1, B1 ); \ + A0 = _mm512_add_epi64( A0, B0 ); \ + A1 = _mm512_add_epi64( A1, B1 ); \ + z0 = _mm512_add_epi64( z0, z0 ); \ + z1 = _mm512_add_epi64( z1, z1 ); \ + A0 = _mm512_add_epi64( A0, z0 ); \ + A1 = _mm512_add_epi64( A1, z1 ); \ + D0 = _mm512_xor_si512(D0, A0); \ + D1 = _mm512_xor_si512(D1, A1); \ + D0 = _mm512_ror_epi64(D0, 16); \ + D1 = _mm512_ror_epi64(D1, 16); \ + z0 = _mm512_mul_epu32( C0, D0 ); \ + z1 = _mm512_mul_epu32( C1, D1 ); \ + C0 = _mm512_add_epi64( C0, D0 ); \ + C1 = _mm512_add_epi64( C1, D1 ); \ + z0 = _mm512_add_epi64( z0, z0 ); \ + z1 = _mm512_add_epi64( z1, z1 ); \ + C0 = _mm512_add_epi64( C0, z0 ); \ + C1 = _mm512_add_epi64( C1, z1 ); \ + B0 = _mm512_xor_si512(B0, C0); \ 
+ B1 = _mm512_xor_si512(B1, C1); \ + B0 = _mm512_ror_epi64(B0, 63); \ + B1 = _mm512_ror_epi64(B1, 63); \ +} + +/* #define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ A0 = muladd(A0, B0); \ @@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y) B0 = _mm512_ror_epi64(B0, 24); \ B1 = _mm512_ror_epi64(B1, 24); \ } while ((void)0, 0) - +*/ +/* #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ A0 = muladd(A0, B0); \ @@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y) B0 = _mm512_ror_epi64(B0, 63); \ B1 = _mm512_ror_epi64(B1, 63); \ } while ((void)0, 0) +*/ #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ -\ C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ -\ D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ } while ((void)0, 0) @@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y) do { \ B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ -\ C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ -\ D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ } while ((void)0, 0) @@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y) do { \ G1(A0, B0, C0, D0, A1, B1, C1, D1); \ G2(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ G1(A0, B0, C0, D0, A1, B1, C1, D1); \ G2(A0, B0, C0, D0, A1, B1, C1, D1); \ -\ UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ } while ((void)0, 0) +static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 }; +static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 }; +static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 }; +static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 }; + #define SWAP_HALVES(A0, A1) \ do { \ __m512i t; \ @@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y) A0 = t; \ } while((void)0, 0) +#define SWAP_QUARTERS(A0, A1) \ +{ \ + __m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \ + A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \ + A0 = t; \ +} + +#define UNSWAP_QUARTERS(A0, A1) \ +{ \ + __m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \ + A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \ + A0 = t; \ +} + +/* #define SWAP_QUARTERS(A0, A1) \ do { \ SWAP_HALVES(A0, A1); \ A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \ A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \ } while((void)0, 0) - +*/ +/* #define UNSWAP_QUARTERS(A0, A1) \ do { \ A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \ A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \ SWAP_HALVES(A0, A1); \ } while((void)0, 0) +*/ #define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \ do { \ diff --git a/algo/bmw/bmw512-hash-4way.c b/algo/bmw/bmw512-hash-4way.c index e1645d3..9f7ac57 100644 --- a/algo/bmw/bmw512-hash-4way.c +++ b/algo/bmw/bmw512-hash-4way.c @@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) mj[14] = mm256_rol_64( M[14], 15 ); mj[15] = mm256_rol_64( M[15], 16 ); - __m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL ); - const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL ); + __m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL ); + 
static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL, + 0x0555555555555555ULL, 0x0555555555555555ULL }; qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K ); K = _mm256_add_epi64( K, Kincr ); @@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], __m512i dH[16] ) { __m512i qt[32], xl, xh; - __m512i mh[16]; + __m512i mh[16], mj[16]; int i; for ( i = 0; i < 16; i++ ) @@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] ); qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] ); - __m512i mj[16]; - mj[ 0] = mm512_rol_64( M[ 0], 1 ); mj[ 1] = mm512_rol_64( M[ 1], 2 ); mj[ 2] = mm512_rol_64( M[ 2], 3 ); @@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16], mj[14] = mm512_rol_64( M[14], 15 ); mj[15] = mm512_rol_64( M[15], 16 ); - __m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL ); - const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL ); + __m512i K = _mm512_set1_epi64( 0x5555555555555550ULL ); + static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL, + 0x0555555555555555ULL, 0x0555555555555555ULL, + 0x0555555555555555ULL, 0x0555555555555555ULL, + 0x0555555555555555ULL, 0x0555555555555555ULL }; qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K ); K = _mm512_add_epi64( K, Kincr ); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 87c4bfa..4769bb6 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -503,32 +503,28 @@ do { \ SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \ SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \ s4 = mm512_swap64_32( s4 ); \ - s5 = mm512_swap64_32( s5 ); \ + t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \ sD = mm512_swap64_32( sD ); \ - sE = mm512_swap64_32( sE ); \ - t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \ + t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \ L8( s0, t0, s9, t1 ); \ s6 = mm512_swap64_32( s6 ); \ sF = mm512_swap64_32( sF ); \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \ + t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \ + t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \ L8( s1, t2, sA, t3 ); \ s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \ sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \ \ - s7 = mm512_swap64_32( s7 ); \ - sC = mm512_swap64_32( sC ); \ - t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \ - t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \ + t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \ + t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \ L8( s2, t4, sB, t5 ); \ s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \ sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \ s6 = mm512_swap64_32( s6 ); \ sF = mm512_swap64_32( sF ); \ \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \ + t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \ + t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \ L8( s3, t2, s8, t3 ); \ s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \ s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \ @@ -537,21 +533,20 @@ do { \ s7 = mm512_swap64_32( s7 ); \ sC = mm512_swap64_32( sC ); \ \ - t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \ + t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \ t1 = 
_mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ - t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \ + t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \ t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \ t3 = mm512_swap64_32( t3 ); \ L8( t0, t1, t2, t3 ); \ - t3 = mm512_swap64_32( t3 ); \ s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ - s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \ + s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \ s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ - s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \ + s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \ sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ - s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \ - sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \ + s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \ + sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \ \ t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \ @@ -1268,7 +1263,7 @@ do { \ } while (0) #endif -// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions +// v3, 15 instructions #define SBOX( a, b, c, d ) \ { \ __m256i tb, td; \ @@ -1286,7 +1281,7 @@ do { \ #endif /* -/ v2, 16 instructions, 10 TL equivalent instructions +/ v2, 16 instructions #define SBOX( a, b, c, d ) \ { \ __m256i t = mm256_xorand( d, a, c ); \ diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index ec35ddf..8ef54bb 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = { __m512i t = a0; \ a0 = mm512_xoror( a3, a0, a1 ); \ a2 = _mm512_xor_si512( a2, a3 ); \ - a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ + a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \ a3 = mm512_xorand( a2, a3, t ); \ a2 = mm512_xorand( a1, a2, a0); \ a1 = _mm512_or_si512( a1, a3 ); \ a3 = _mm512_xor_si512( a3, a2 ); \ t = _mm512_xor_si512( t, a1 ); \ a2 = _mm512_and_si512( a2, a1 ); \ - a1 = mm512_xnor( a1, a0 ); \ + a1 = mm512_nxor( a1, a0 ); \ a0 = t; \ } @@ -527,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state, __m256i t = a0; \ a0 = mm256_xoror( a3, a0, a1 ); \ a2 = _mm256_xor_si256( a2, a3 ); \ - a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ + a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \ a3 = mm256_xorand( a2, a3, t ); \ a2 = mm256_xorand( a1, a2, a0); \ a1 = _mm256_or_si256( a1, a3 ); \ a3 = _mm256_xor_si256( a3, a2 ); \ t = _mm256_xor_si256( t, a1 ); \ a2 = _mm256_and_si256( a2, a1 ); \ - a1 = mm256_xnor( a1, a0 ); \ + a1 = mm256_nxor( a1, a0 ); \ a0 = t; \ } diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 070bbf5..99ed86d 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -69,18 +69,18 @@ v128_t t = a0; \ a0 = v128_xoror( a3, a0, a1 ); \ a2 = v128_xor( a2, a3 ); \ - a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ + a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \ a3 = v128_xorand( a2, a3, t ); \ a2 = v128_xorand( a1, a2, a0 ); \ a1 = v128_or( a1, a3 ); \ a3 = v128_xor( a3, a2 ); \ t = v128_xor( t, a1 ); \ a2 = v128_and( a2, a1 ); \ - a1 = v128_xnor( a1, a0 ); \ + a1 = v128_nxor( a1, a0 ); \ a0 = t; \ } -#else +#elif 
defined(__ARM_NEON) || defined(__SSE2__) #define SUBCRUMB( a0, a1, a2, a3 ) \ { \ diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index ce134d6..62495d5 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -67,7 +67,7 @@ static const uint64_t K512[80] = 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 }; -#if defined(__AVX2__) && defined(__SHA512__) +#if defined(__AVX__) && defined(__SHA512__) // SHA-512 implemented using SHA512 CPU extension. diff --git a/algo/sha/sha512-hash.h b/algo/sha/sha512-hash.h index ffd6535..ccca27b 100644 --- a/algo/sha/sha512-hash.h +++ b/algo/sha/sha512-hash.h @@ -5,7 +5,7 @@ #include "simd-utils.h" #include "sph_sha2.h" -#if defined(__SHA512__) && defined(__AVX2__) +#if defined(__SHA512__) && defined(__AVX__) // Experimental, untested // Need to substitute for sph_sha512 diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index c149a56..7cb40f1 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -305,7 +305,7 @@ do { \ xb0 = mm512_rol_32( xb0, 1 ); \ xa0 = mm512_xor3( xm, xb1, \ mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \ - xb0 = mm512_xnor( xa0, xb0 ); \ + xb0 = mm512_nxor( xa0, xb0 ); \ } while (0) #define PERM_STEP_0_16 do { \ @@ -898,7 +898,7 @@ do { \ xb0 = mm256_rol_32( xb0, 1 ); \ xa0 = mm256_xor3( xm, xb1, \ mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \ - xb0 = mm256_xnor( xa0, xb0 ); \ + xb0 = mm256_nxor( xa0, xb0 ); \ } while (0) #define PERM_STEP_0_8 do { \ diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c index e3e32cb..a7f8511 100644 --- a/algo/simd/simd-hash-2way.c +++ b/algo/simd/simd-hash-2way.c @@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) = {{ -30, 55, -58, -65, -95, -40, -98, 94 }}, }; +#if defined(__AVX2__) + +static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff, + 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff }; +#define V128_00FF _mm256_castsi256_si128( V256_00FF ) + +#elif defined(__SSE2__) || defined(__ARM_NEON ) + +static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff }; + +#endif + +#if defined(SIMD512) + +static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101, + 0x0101010101010101, 0x0101010101010101, + 0x0101010101010101, 0x0101010101010101, + 0x0101010101010101, 0x0101010101010101 }; +#define V256_0101 _mm512_castsi512_si256( V512_0101 ) +#define V128_0101 _mm512_castsi512_si128( V512_0101 ) + + +static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080, + 0x0080008000800080, 0x0080008000800080, + 0x0080008000800080, 0x0080008000800080, + 0x0080008000800080, 0x0080008000800080 }; +#define V256_0080 _mm512_castsi512_si256( V512_0080 ) +#define V128_0080 _mm512_castsi512_si128( V512_0080 ) + +#elif defined(__AVX2__) + +static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101, + 0x0101010101010101, 0x0101010101010101 }; +#define V128_0101 _mm256_castsi256_si128( V256_0101 ) + +static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080, + 0x0080008000800080, 0x0080008000800080 }; +#define V128_0080 _mm256_castsi256_si128( V256_0080 ) + +#elif defined(__SSE2__) || defined(__ARM_NEON ) + +static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 }; + +static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 }; + +#endif + #if defined(__x86_64__) #define SHUFXOR_1(x) 
_mm_shuffle_epi32(x,0xb1) @@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) = #define shufxor(x,s) XCAT(SHUFXOR_,s)(x) #define REDUCE(x) \ - v128_sub16( v128_and( x, v128_64( \ - 0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) ) + v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) ) #define EXTRA_REDUCE_S(x)\ - v128_sub16( x, v128_and( \ - v128_64( 0x0101010101010101 ), \ - v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) ) + v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) ) #define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) @@ -293,10 +337,9 @@ do { \ // This will make the full FFT_64 in order. #define INTERLEAVE(i,j) \ do { \ - v128u16_t t1= X(i); \ - v128u16_t t2= X(j); \ - X(i) = v128_unpacklo16( t1, t2 ); \ - X(j) = v128_unpackhi16( t1, t2 ); \ + v128u16_t t = X(i); \ + X(i) = v128_unpacklo16( t, X(j) ); \ + X(j) = v128_unpackhi16( t, X(j) ); \ } while(0) INTERLEAVE( 1, 0 ); @@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] = #define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x) -#if defined(VL256) - #define REDUCE(x) \ - _mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \ - _mm256_srai_epi16( x, 8 ) ) -#else - -#define REDUCE(x) \ - _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \ - 0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) ) - -#endif + _mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) ) #define EXTRA_REDUCE_S(x)\ - _mm256_sub_epi16( x, _mm256_and_si256( \ - _mm256_set1_epi64x( 0x0101010101010101 ), \ - _mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) ) + _mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \ + _mm256_cmpgt_epi16( x, V256_0080 ) ) ) #define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) @@ -917,10 +949,9 @@ do { \ // This will make the full FFT_64 in order. 
#define INTERLEAVE(i,j) \ do { \ - __m256i t1= X(i); \ - __m256i t2= X(j); \ - X(i) = _mm256_unpacklo_epi16( t1, t2 ); \ - X(j) = _mm256_unpackhi_epi16( t1, t2 ); \ + __m256i t = X(i); \ + X(i) = _mm256_unpacklo_epi16( t, X(j) ); \ + X(j) = _mm256_unpackhi_epi16( t, X(j) ); \ } while(0) INTERLEAVE( 1, 0 ); @@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] = _mm512_srai_epi16( x, 8 ) ) #define EXTRA_REDUCE_S4w(x) \ - _mm512_sub_epi16( x, _mm512_and_si512( \ - _mm512_set1_epi64( 0x0101010101010101 ), \ - _mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \ - x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) ) + _mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \ + _mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) ) // generic, except it calls targetted macros #define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) ) diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index 7bd2d2a..9730e08 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) #if defined(__AVX2__) __m256i F0, F1, F2, F3, F4, F5, F6, F7; - __m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] ); + __m256i *table = (__m256i*)fftTable; + __m256i tbl = table[ input[0] ]; __m256i *mul = (__m256i*)multipliers; __m256i *out = (__m256i*)output; F0 = _mm256_mullo_epi32( mul[0], tbl ); - tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] ); + tbl = table[ input[1] ]; F1 = _mm256_mullo_epi32( mul[1], tbl ); - tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] ); + tbl = table[ input[2] ]; F2 = _mm256_mullo_epi32( mul[2], tbl ); - tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] ); + tbl = table[ input[3] ]; F3 = _mm256_mullo_epi32( mul[3], tbl ); - tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] ); + tbl = table[ input[4] ]; F4 = _mm256_mullo_epi32( mul[4], tbl ); - tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] ); + tbl = table[ input[5] ]; F5 = _mm256_mullo_epi32( mul[5], tbl ); - tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] ); + tbl = table[ input[6] ]; F6 = _mm256_mullo_epi32( mul[6], tbl ); - tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] ); + tbl = table[ input[7] ]; F7 = _mm256_mullo_epi32( mul[7], tbl ); #define ADD_SUB( a, b ) \ @@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) ADD_SUB( F1, F3 ); ADD_SUB( F4, F6 ); ADD_SUB( F5, F7 ); - F5 = _mm256_slli_epi32( F5, 2 ); F6 = _mm256_slli_epi32( F6, 4 ); F7 = _mm256_slli_epi32( F7, 6 ); + F5 = _mm256_slli_epi32( F5, 2 ); ADD_SUB( F0, F4 ); ADD_SUB( F1, F5 ); ADD_SUB( F2, F6 ); diff --git a/armbuild-all.sh b/armbuild-all.sh index e77cf79..e2ebbe5 100755 --- a/armbuild-all.sh +++ b/armbuild-all.sh @@ -4,11 +4,11 @@ # during development. However, the information contained may provide compilation # tips to users. 
-rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null +rm cpuminer cpuminer-m2 cpuminer-m4 cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto > /dev/null # armv9 needs gcc-13 # -march-armv9-a includes SVE2 but no crypto -# -march=armv9-a+crypto adds AES & SHA2 but not SHA512 +# -march=armv9-a+crypto adds AES & SHA256 but not SHA512 make distclean || echo clean rm -f config.status @@ -27,18 +27,37 @@ CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-c make -j $(nproc) mv cpuminer cpuminer-armv9 +# Apple M4: armv9.2, AES, SHA3, SVE2 +make clean || echo clean +CFLAGS="-O3 -march=armv9.2-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +mv cpuminer cpuminer-m4 + +# Apple M2: armv8.6, AES, SHA3 +make clean || echo clean +CFLAGS="-O3 -march=armv8.6-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +mv cpuminer cpuminer-m2 + # SVE2 available in armv8.5 make clean || echo clean CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl make -j $(nproc) mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2 -# SHA3 available in armv8.4 +# Apple M1: armv8.4 AES, SHA3 make clean || echo clean CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl make -j $(nproc) mv cpuminer cpuminer-armv8.4-crypto-sha3 +# Cortex-A76 (Orange Pi 5) +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=armv8.2-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl +make -j $(nproc) +mv cpuminer cpuminer-armv8-crypto + make clean || echo clean rm -f config.status CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl diff --git a/build-allarch.sh b/build-allarch.sh index 7495ea6..1a638e5 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -34,9 +34,7 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-arrowlake-s -# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14 -# Granitrapids does not build with AVX10, SHA512 or APX. -# wait for Diamondrapids & gcc-15. 
+# Intel Core Graniterapids: AVX512, SHA256, VAES, AMX, needs gcc-14 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl @@ -44,13 +42,13 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-graniterapids -# SHA512 AVX10.1 -#make clean || echo clean -#rm -f config.status -#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl -#make -j $(nproc) -#strip -s cpuminer -#mv cpuminer cpuminer-avx10_1 +# Graniterapids + SHA512, AVX10.1 +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-avx10.1 # SHA512 AVX10.2 #make clean || echo clean @@ -72,6 +70,9 @@ mv cpuminer cpuminer-graniterapids make clean || echo clean rm -f config.status CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl +# zen4 is close enough for older compiler +#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl + make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-zen5 @@ -138,13 +139,21 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-avx -# SSE4.2 AES: Intel Westmere, most Pentium & Celeron +# SSE4.2 AES SHA: Intel Atom Goldmont, newer Pentium & Celeron +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=goldmont -Wall" ./configure --with-curl +make -j $(nproc) +strip -s cpuminer +mv cpuminer cpuminer-sse42-aes-sha + +# SSE4.2 AES: Intel Westmere, older Pentium & Celeron make clean || echo clean rm -f config.status CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl make -j $(nproc) strip -s cpuminer -mv cpuminer cpuminer-aes-sse42 +mv cpuminer cpuminer-sse42-aes # SSE4.2: Intel Nehalem make clean || echo clean diff --git a/clean-all.sh b/clean-all.sh index fd2f57b..e4350fd 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,8 +2,8 @@ # # make clean and rm all the targetted executables. -rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null +rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512* cpuminer-alderlake cpuminer-avx10* cpuminer-avx2* cpuminer-avx cpuminer-sse* cpuminer-ssse3 cpuminer-zen* cpuminer-x64 cpuminer-armv* cpuminer-m2 cpuminer-m4 > /dev/null -rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null +rm cpuminer-avx512* cpuminer-avx2* cpuminer-avx.exe cpuminer-sse* cpuminer-zen* cpuminer-x64.exe > /dev/null make distclean > /dev/null diff --git a/configure b/configure index 1289149..31f3db9 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.6. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.7. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='25.6' -PACKAGE_STRING='cpuminer-opt 25.6' +PACKAGE_VERSION='25.7' +PACKAGE_STRING='cpuminer-opt 25.7' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1431,7 +1431,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 25.6:";; + short | recursive ) echo "Configuration of cpuminer-opt 25.7:";; esac cat <<\_ACEOF @@ -1536,7 +1536,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 25.6 +cpuminer-opt configure 25.7 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 25.6, which was +It was created by cpuminer-opt $as_me 25.7, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3591,7 +3591,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='25.6' + VERSION='25.7' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 25.6, which was +This file was extended by cpuminer-opt $as_me 25.7, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 25.6 +cpuminer-opt config.status 25.7 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index ad237ab..666cba3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [25.6]) +AC_INIT([cpuminer-opt], [25.7]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index aba559b..7d54f46 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.6. +# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.7. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation, @@ -601,8 +601,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='25.6' -PACKAGE_STRING='cpuminer-opt 25.6' +PACKAGE_VERSION='25.7' +PACKAGE_STRING='cpuminer-opt 25.7' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. 
cat <<_ACEOF -'configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems. +'configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1424,7 +1424,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 25.6:";; + short | recursive ) echo "Configuration of cpuminer-opt 25.7:";; esac cat <<\_ACEOF @@ -1528,7 +1528,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 25.6 +cpuminer-opt configure 25.7 generated by GNU Autoconf 2.72 Copyright (C) 2023 Free Software Foundation, Inc. @@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 25.6, which was +It was created by cpuminer-opt $as_me 25.7, which was generated by GNU Autoconf 2.72. Invocation command line was $ $0$ac_configure_args_raw @@ -3768,7 +3768,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='25.6' + VERSION='25.7' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7581,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 25.6, which was +This file was extended by cpuminer-opt $as_me 25.7, which was generated by GNU Autoconf 2.72. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7649,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 25.6 +cpuminer-opt config.status 25.7 configured by $0, generated by GNU Autoconf 2.72, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 5adfd72..463744e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -921,40 +921,33 @@ out: return rc; } -// returns the unit prefix and the hashrate appropriately scaled. -void scale_hash_for_display ( double* hashrate, char* prefix ) -{ - if ( *hashrate < 1e4 ) *prefix = 0; - else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; } - else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; } - else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; } - else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; } - else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; } - else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; } - else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; } - else { *prefix = 'Y'; *hashrate /= 1e24; } -} - +// Does not account for leap years. 
static inline void sprintf_et( char *str, long unsigned int seconds ) { - long unsigned int min = seconds / 60; - long unsigned int sec = seconds % 60; - long unsigned int hrs = min / 60; - - if ( unlikely( hrs ) ) + long unsigned int minutes = seconds / 60; + if ( minutes ) { - long unsigned int days = hrs / 24; - long unsigned int years = days / 365; - if ( years ) // 0y000d - sprintf( str, "%luy%lud", years, years % 365 ); - else if ( days ) // 0d00h - sprintf( str, "%lud%02luh", days, hrs % 24 ); - else // 0h00m - sprintf( str, "%luh%02lum", hrs, min % 60 ); + long unsigned int hours = minutes / 60; + if ( hours ) + { + long unsigned int days = hours / 24; + if ( days ) + { + long unsigned int years = days / 365; + if ( years ) + sprintf( str, "%luy%03lud", years, days % 365 ); // 0y000d + else + sprintf( str, "%lud%02luh", days, hours % 24 ); // 0d00h + } + else + sprintf( str, "%luh%02lum", hours, minutes % 60 ); // 0h00m + } + else + sprintf( str, "%lum%02lus", minutes, seconds % 60 ); // 0m00s } - else // 0m00s - sprintf( str, "%lum%02lus", min, sec ); -} + else + sprintf( str, "%lus", seconds ); // 0s +} const long double exp32 = EXP32; // 2**32 const long double exp48 = EXP32 * EXP16; // 2**48 @@ -2833,67 +2826,29 @@ static void show_credits() static bool cpu_capability( bool display_only ) { char cpu_brand[0x40]; - bool cpu_has_sse2 = has_sse2(); // X86_64 only - bool cpu_has_ssse3 = has_ssse3(); // X86_64 only - bool cpu_has_sse41 = has_sse41(); // X86_64 only - bool cpu_has_sse42 = has_sse42(); - bool cpu_has_avx = has_avx(); - bool cpu_has_neon = has_neon(); // AArch64 - bool cpu_has_sve = has_sve(); // aarch64 only, insignificant - bool cpu_has_sve2 = has_sve2(); // AArch64 only - bool cpu_has_sme = has_sme(); - bool cpu_has_sme2 = has_sme2(); - bool cpu_has_avx2 = has_avx2(); - bool cpu_has_avx512 = has_avx512(); - bool cpu_has_avx10 = has_avx10(); - bool cpu_has_aes = has_aes(); // x86_64 or AArch64 - bool cpu_has_vaes = has_vaes(); // X86_64 only - bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64 - bool cpu_has_sha512 = has_sha512(); bool sw_has_x86_64 = false; bool sw_has_aarch64 = false; int sw_arm_arch = 0; // AArch64 version bool sw_has_neon = false; // AArch64 - bool sw_has_sve = false; // AArch64 - bool sw_has_sve2 = false; // AArch64 + bool sw_has_sve = false; + bool sw_has_sve2 = false; bool sw_has_sme = false; bool sw_has_sme2 = false; bool sw_has_sse2 = false; // x86_64 - bool sw_has_ssse3 = false; // x86_64 - bool sw_has_sse41 = false; // x86_64 + bool sw_has_ssse3 = false; + bool sw_has_sse41 = false; bool sw_has_sse42 = false; bool sw_has_avx = false; bool sw_has_avx2 = false; bool sw_has_avx512 = false; bool sw_has_avx10 = false; - bool sw_has_aes = false; - bool sw_has_vaes = false; + bool sw_has_amx = false; + bool sw_has_apx = false; + bool sw_has_aes = false; // x86_64 or AArch64 + bool sw_has_vaes = false; // x86_64 bool sw_has_sha256 = false; // x86_64 or AArch64 - bool sw_has_sha512 = false; // x86_64 or AArch64 -/* - set_t algo_features = algo_gate.optimizations; - bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); - bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); - bool algo_has_avx = set_incl( AVX_OPT, algo_features ); - bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); - bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); - bool algo_has_aes = set_incl( AES_OPT, algo_features ); - bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); - bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features ); - 
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features ); - bool algo_has_neon = set_incl( NEON_OPT, algo_features ); - bool use_sse2; - bool use_sse42; - bool use_avx; - bool use_avx2; - bool use_avx512; - bool use_aes; - bool use_vaes; - bool use_sha256; - bool use_sha512; - bool use_neon; - bool use_none; -*/ + bool sw_has_sha512 = false; + #if defined(__x86_64__) sw_has_x86_64 = true; #elif defined(__aarch64__) @@ -2928,14 +2883,15 @@ static bool cpu_capability( bool display_only ) #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) sw_has_avx512 = true; #endif -// AVX10 version is not significant as of AVX10.2. If that changes use a better -// way to test the version than sequentially. -// #if defined(__AVX10_2__) -// -// #elif defined(__AVX10_1__) - #if defined(__AVX10_1__) + #if defined(__AVX10_1__) // version is not significant sw_has_avx10 = true; #endif + #ifdef __AMX_TILE__ + sw_has_amx = true; + #endif + #ifdef __APX_F__ + sw_has_apx = true; + #endif // x86_64 or AArch64 #if defined(__AES__) || defined(__ARM_FEATURE_AES) @@ -2955,6 +2911,7 @@ static bool cpu_capability( bool display_only ) #if defined(__ARM_NEON) sw_has_neon = true; #endif + // FYI, SVE & SME not used by cpuminer #if defined(__ARM_FEATURE_SVE) sw_has_sve = true; #endif @@ -2975,8 +2932,7 @@ static bool cpu_capability( bool display_only ) // Build printf( "SW built on " __DATE__ #if defined(__clang__) - " with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, - __clang_patchlevel__ ); + " with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__ ); #elif defined(__GNUC__) " with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ ); #endif @@ -3002,27 +2958,30 @@ static bool cpu_capability( bool display_only ) printf("CPU features: "); if ( cpu_arch_x86_64() ) { - if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() ); - if ( cpu_has_avx512 ) printf( " AVX512" ); - else if ( cpu_has_avx2 ) printf( " AVX2 " ); - else if ( cpu_has_avx ) printf( " AVX " ); - else if ( cpu_has_sse42 ) printf( " SSE4.2" ); - else if ( cpu_has_sse41 ) printf( " SSE4.1" ); - else if ( cpu_has_ssse3 ) printf( " SSSE3 " ); - else if ( cpu_has_sse2 ) printf( " SSE2 " ); + if ( has_avx10() ) printf( " AVX10.%d", avx10_version() ); + else if ( has_avx512() ) printf( " AVX512" ); + else if ( has_avx2() ) printf( " AVX2 " ); + else if ( has_avx() ) printf( " AVX " ); + else if ( has_sse42() ) printf( " SSE4.2" ); + else if ( has_sse41() ) printf( " SSE4.1" ); + else if ( has_ssse3() ) printf( " SSSE3 " ); + else if ( has_sse2() ) printf( " SSE2 " ); + if ( has_amx() ) printf( " AMX" ); + if ( has_apx_f() ) printf( " APX" ); + } else if ( cpu_arch_aarch64() ) { - if ( cpu_has_neon ) printf( " NEON" ); - if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() ); - else if ( cpu_has_sve ) printf( " SVE" ); - if ( cpu_has_sme2 ) printf( " SME2" ); - else if ( cpu_has_sme ) printf( " SME" ); + if ( has_neon() ) printf( " NEON" ); + if ( has_sve2() ) printf( " SVE2-%d", sve_vector_length() ); + else if ( has_sve() ) printf( " SVE" ); + if ( has_sme2() ) printf( " SME2" ); + else if ( has_sme() ) printf( " SME" ); } - if ( cpu_has_vaes ) printf( " VAES" ); - else if ( cpu_has_aes ) printf( " AES" ); - if ( cpu_has_sha512 ) printf( " SHA512" ); - else if ( cpu_has_sha256 ) printf( " SHA256" ); + if ( has_vaes() ) printf( " VAES" ); + else if ( has_aes() ) printf( " AES" ); + if ( has_sha512() ) printf( " SHA512" ); + else if ( has_sha256() ) printf( " SHA256" 
); printf("\nSW features: "); if ( sw_has_x86_64 ) @@ -3035,6 +2994,8 @@ static bool cpu_capability( bool display_only ) else if ( sw_has_sse41 ) printf( " SSE4.1" ); else if ( sw_has_ssse3 ) printf( " SSSE3 " ); else if ( sw_has_sse2 ) printf( " SSE2 " ); + if ( sw_has_amx ) printf( " AMX" ); + if ( sw_has_apx ) printf( " APX" ); } else if ( sw_has_aarch64 ) { diff --git a/miner.h b/miner.h index db0c123..59441ac 100644 --- a/miner.h +++ b/miner.h @@ -542,7 +542,9 @@ void applog_hash(void *hash); void format_hashrate(double hashrate, char *output); void print_hash_tests(void); +// Factors of 1000 used for hashes, ie kH/s, Mh/s. void scale_hash_for_display ( double* hashrate, char* units ); +// Factors of 1024 used for bytes, ie kiB, MiB. void format_number_si( double* hashrate, char* si_units ); void report_summary_log( bool force ); diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 3b7d56d..b05c1d7 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -447,7 +447,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 ) // ~( a ^ b ), same as (~a) ^ b -#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 ) +#define v128_nxor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 ) #else @@ -469,7 +469,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) ) -#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) ) +#define v128_nxor( a, b ) v128_not( _mm_xor_si128( a, b ) ) #endif diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index ddfbc6b..8693322 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -170,7 +170,7 @@ static inline __m256i mm256_not( const __m256i v ) #define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 ) // ~( a ^ b ), same as (~a) ^ b -#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 ) +#define mm256_nxor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 ) #else @@ -208,7 +208,7 @@ static inline __m256i mm256_not( const __m256i v ) #define mm256_orand( a, b, c ) \ _mm256_or_si256( a, _mm256_and_si256( b, c ) ) -#define mm256_xnor( a, b ) \ +#define mm256_nxor( a, b ) \ mm256_not( _mm256_xor_si256( a, b ) ) #endif diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index fef0197..32753f3 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -18,8 +18,13 @@ // AVX512 intrinsics have a few changes from previous conventions. // -// "_mm512_cmp" instructions now returns a bitmask instead of a vector mask. -// This removes the need for an explicit movemask instruction. +// "_mm512_cmp" instructions now return a bitmask instead of a vector mask. +// This removes the need for an explicit movemask instruction. It is also +// slower than AVX2 cmp. There is no version of AVX512 cmp that returns a +// vector result resulting in a double penalty if a vector result is needed: +// slower cmp instruction & extra instruction to convert bit mask into +// vector mask. 256 bit & 128 bit still have legacy cmp returning vector +// while AVX512VL adds masked versions returning bit mask. // // Many previously sizeless (si) instructions now have sized (epi) versions // to accomodate masking packed elements. @@ -30,7 +35,7 @@ // list. // // "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other -// AVX512 permutes can cross all lanes. 
+// AVX512 instructions using the permute name can cross all lanes. // // New alignr instructions for epi64 and epi32 operate across the entire // vector but slower than epi8 which continues to be restricted to 128 bit @@ -50,16 +55,17 @@ // parentheses to ensure the expression argument is evaluated first. // - if an argument is to referenced multiple times a C inline function // should be used instead of a macro to prevent an expression argument -// from being evaluated multiple times (wasteful) or produces side +// from being evaluated multiple times (wasteful) or produce side // effects (very bad). // // There are 2 areas where overhead is a major concern: constants and // permutations. // // Constants need to be composed at run time by assembling individual -// elements, very expensive. The cost is proportional to the number of -// different elements therefore use the largest element size possible, -// merge smaller integer elements to 64 bits, and group repeated elements. +// elements or loaded from memory, very expensive. The cost of runtime +// construction is proportional to the number of different elements +// therefore use the largest element size possible merging smaller integer +// elements to 64 bits, and group repeated elements. // // Constants with repeating patterns can be optimized with the smaller // patterns repeated more frequently being more efficient. @@ -67,14 +73,15 @@ // Some specific constants can be very efficient. Zero is very efficient, // 1 and -1 slightly less so. // -// If an expensive constant is to be reused in the same function it should -// be declared as a local variable defined once and reused. +// If an expensive constant is to be reused in the same function it may +// be declared as a local variable defined once and reused. If frequently +// used it can be declared as a static const in memory. // // Permutations can be very expensive if they use a vector control index, -// even if the permutation itself is quite efficient. -// The index is essentially a constant with all the baggage that brings. -// The same rules apply, if an index is to be reused it should be defined -// as a local. This applies specifically to bswap operations. +// even if the permute instruction itself is quite efficient. +// The index is essentially a vector constant with all the baggage that +// brings. The same rules apply, if an index is to be reused it should either +// be defined as a local or static const. // // Permutations that cross 128 bit lanes are typically slower and often need // a vector control index. If the permutation doesn't need to cross 128 bit @@ -221,7 +228,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 ) // ~( a ^ b ), (~a) ^ b -#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 ) +#define mm512_nxor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 ) // ~( a & b ) #define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef ) @@ -241,6 +248,15 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_ror_32 _mm512_ror_epi32 #define mm512_rol_32 _mm512_rol_epi32 +/* not used +#if defined(__AVX512VBMI2__) + +#define mm512_ror_16( v, c ) _mm512_shrdi_epi16( c, v, v ) +#define mm512_rol_16( v, c ) _mm512_shldi_epi16( c, v, v ) + +#endif +*/ + // // Reverse byte order of packed elements, vectorized endian conversion. 
@@ -249,9 +265,17 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 ) /* not used +#if defined(__AVX512VBMI2__) + +#define mm512_bswap_16( v ) mm512_ror_16( v, 8 ) + +#else + #define mm512_bswap_16( v ) \ _mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \ - 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) ) + 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) ) + +#endif */ #define mm512_bswap_16( v ) \ @@ -431,8 +455,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c ) _mm512_castsi512_ps( v2 ), c ) ); // 64 bit lanes -// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE. - +// Redundant with ror & rol but included for consistency with AVX2/SSE. #define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 ) #define mm512_swap64_32 mm512_qrev32 // grandfathered diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h index a1816fc..7ae8de0 100644 --- a/simd-utils/simd-neon.h +++ b/simd-utils/simd-neon.h @@ -126,7 +126,7 @@ #define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 ) // ~( v1 ^ v0 ), same as (~v1) ^ v0 -#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) ) +#define v128_nxor( v1, v0 ) v128_not( v128_xor( v1, v0 ) ) // ~v1 | v0, args reversed for consistency with x86_64 #define v128_ornot( v1, v0 ) vornq_u32( v0, v1 ) @@ -349,52 +349,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n ) vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) ) // Bit rotation -/* -#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 ) -#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 ) -#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 ) -#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 ) -#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 ) -#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 ) -#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 ) -#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 ) - -#define v128_ror64( v, c ) \ - ( (c) == 8 ) ? v128_shuflr64_8( v ) \ - : ( (c) == 16 ) ? v128_shuflr64_16( v ) \ - : ( (c) == 24 ) ? v128_shuflr64_24( v ) \ - : ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ - : ( (c) == 40 ) ? v128_shufll64_24( v ) \ - : ( (c) == 48 ) ? v128_shufll64_16( v ) \ - : ( (c) == 56 ) ? v128_shufll64_8( v ) \ - : vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \ - ((uint64x2_t)(v)), c ) - -#define v128_rol64( v, c ) \ - ( (c) == 8 ) ? v128_shufll64_8( v ) \ - : ( (c) == 16 ) ? v128_shufll64_16( v ) \ - : ( (c) == 24 ) ? v128_shufll64_24( v ) \ - : ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ - : ( (c) == 40 ) ? v128_shuflr64_24( v ) \ - : ( (c) == 48 ) ? v128_shuflr64_16( v ) \ - : ( (c) == 56 ) ? v128_shuflr64_8( v ) \ - : vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \ - ((uint64x2_t)(v)), c ) - -#define v128_ror32( v, c ) \ - ( (c) == 8 ) ? v128_shuflr32_8( v ) \ - : ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \ - : ( (c) == 24 ) ? v128_shufll32_8( v ) \ - : vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \ - ((uint32x4_t)(v)), c ) - -#define v128_rol32( v, c ) \ - ( (c) == 8 ) ? v128_shufll32_8( v ) \ - : ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \ - : ( (c) == 24 ) ? 
diff --git a/sysinfos.c b/sysinfos.c
index dc1edf3..0fbf7a3 100644
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -474,6 +474,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 }
 
 /*
+// ARM feature compiler flags
 #ifdef __aarch64__
 #warning "__aarch64__"
 #endif
@@ -509,16 +510,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 #endif
 */
 
-// Typical display format: AVX10.[version]_[vectorlength], if vector length is
-// omitted 256 is the default.
-// Ex: AVX10.1_512
-// Flags:
-// AVX10  128  256  512
-//   0     0    0    0   = AVX10 not supported
-//   1     1    1    0   = AVX10 256 bit max (version 2)
-//   1     1    1    1   = AVX10 512 bit max (version 1 granite rapids)
-// Other combinations are not defined.
-
 static inline bool cpu_arch_x86_64()
 {
 #if defined(__x86_64__)
@@ -766,6 +757,17 @@ static inline bool has_vbmi2()
 #endif
 }
 
+static inline bool has_amx()
+{
+#if defined(__x86_64__)
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( EXTENDED_FEATURES, 0, cpu_info );
+   return cpu_info[ EDX_Reg ] & AMX_TILE_Flag;
+#else
+   return false;
+#endif
+}
+
 static inline bool has_aes()
 {
 #if defined(__x86_64__)
@@ -815,12 +817,9 @@ static inline bool has_sveaes()
 static inline bool has_sha256()
 {
 #if defined(__x86_64__)
-   if ( has_avx() )
-   {
      unsigned int cpu_info[4] = { 0 };
      cpuid( EXTENDED_FEATURES, 0, cpu_info );
      return cpu_info[ EBX_Reg ] & SHA_Flag;
-   }
   return false;
 #elif defined(__aarch64__) && defined(HWCAP_SHA2)
 // NEON SHA256
@@ -835,7 +834,7 @@ static inline bool has_sha256()
 static inline bool has_sha512()
 {
 #if defined(__x86_64__)
-   if ( has_avx2() )
+   if ( has_avx() )
    {
      unsigned int cpu_info[4] = { 0 };
      cpuid( EXTENDED_FEATURES, 1, cpu_info );
@@ -852,7 +851,6 @@ static inline bool has_sha512()
 #endif
 }
 
-// Arm only
 static inline bool has_sha3()
 {
 #if defined(__aarch64__) && defined(HWCAP_SHA3)
@@ -944,16 +942,6 @@ static inline int sve_vector_length()
    return 0;
 }
 
-// Assume min_vlen refers to the register size
-static inline int rvv_vector_length()
-{
-#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
-   return __riscv_v_min_vlen;
-#endif
-   return 0;
-}
-
-// generic
 static inline int vector_length()
 {
 #if defined(__x86_64__)
@@ -965,8 +953,8 @@ static inline int vector_length()
    return has_sve() ? sve_vector_length()
        : has_neon() ? 128
        : 0;
-#elif defined(__riscv) && defined(__riscv_vector)
-   return rvv_vector_length();
+#elif defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
+   return __riscv_v_min_vlen;
 #endif
    return 0;
 }
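The new has_amx() above relies on the project's cpuid() wrapper and the
EXTENDED_FEATURES / EDX_Reg / AMX_TILE_Flag constants defined elsewhere in
sysinfos.c. As a stand-alone sketch (illustration only, x86_64 with GCC or
Clang assumed, names hypothetical), the same probe can be written with
<cpuid.h>: AMX-TILE is reported in CPUID leaf 7, sub-leaf 0, EDX bit 24.

   #include <cpuid.h>
   #include <stdbool.h>
   #include <stdio.h>

   static bool has_amx_tile( void )
   {
      unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
      // Leaf 7, sub-leaf 0; __get_cpuid_count returns 0 if unsupported.
      if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
         return false;
      return ( edx >> 24 ) & 1;   // AMX-TILE
   }

   int main( void )
   {
      printf( "AMX tile support: %s\n", has_amx_tile() ? "yes" : "no" );
      return 0;
   }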
"%.2f %cH/s" : "%.2f H/s%c", - hashrate, prefix - ); + if ( *hashrate < 1e4 ) *prefix = 0; + else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; } + else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; } + else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; } + else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; } + else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; } + else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; } + else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; } + else { *prefix = 'Y'; *hashrate /= 1e24; } } -// For use with MiB etc +void format_hashrate( double hashrate, char *output ) +{ + char prefix = '\0'; + scale_hash_for_display( &hashrate, &prefix ); + sprintf( output, prefix ? "%.2f %cH/s" : "%.2f H/s%c", hashrate, prefix ); +} + +// Binary SI, factors of 1024 void format_number_si( double* n, char* si_units ) { if ( *n < 1024*10 ) { *si_units = 0; return; } diff --git a/verthash-help.txt b/verthash-help.txt index c722895..edcb01d 100644 --- a/verthash-help.txt +++ b/verthash-help.txt @@ -55,9 +55,9 @@ the current directory it will be created. Data file creation can take up to 30 minutes on a spinning hard drive. Once created the new data file will be verified and used immediately -if a valid url and user were included on the command line. +if a valid url and user was included on the command line. -A default data file can be created by ommitting the url option. That will +A default data file can also be created by ommitting the url option. That will either verify an existing default data file or create one and verify it, then exit.