Jay D Dee
2025-11-15 10:44:32 -05:00
parent 12480a3ea5
commit 8f2f9ec3e9
26 changed files with 474 additions and 452 deletions

View File

@@ -75,6 +75,14 @@ If not what makes it happen or not happen?
Change Log
----------
v25.7
Fixed a bug calculating TTF longer than 1 year.
Faster argon2d.
Faster hamsi AVX512.
Faster swifftx AVX2.
Other small fixes and improvements.
v25.6
Added argon2d1000, argon2d16000 algos.

View File

@@ -67,81 +67,59 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#if defined(__SSSE3__) || defined(__ARM_NEON) #if defined(__SSSE3__) || defined(__ARM_NEON)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \ #define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
do { \ { \
v128_t t0 = v128_alignr8(B1, B0, 8); \ v128_t t = v128_alignr8( B1, B0, 8 ); \
v128_t t1 = v128_alignr8(B0, B1, 8); \ B1 = v128_alignr8( B0, B1, 8 ); \
B0 = t0; \ B0 = t; \
B1 = t1; \ t = v128_alignr8( D1, D0, 8 ); \
\ D0 = v128_alignr8( D0, D1, 8 ); \
t0 = C0; \ D1 = t; \
C0 = C1; \ }
C1 = t0; \
\
t0 = v128_alignr8(D1, D0, 8); \
t1 = v128_alignr8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \ #define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
do { \ { \
v128_t t0 = v128_alignr8(B0, B1, 8); \ v128_t t = v128_alignr8( B0, B1, 8 ); \
v128_t t1 = v128_alignr8(B1, B0, 8); \ B1 = v128_alignr8( B1, B0, 8 ); \
B0 = t0; \ B0 = t; \
B1 = t1; \ t = v128_alignr8( D0, D1, 8 ); \
\ D0 = v128_alignr8( D1, D0, 8 ); \
t0 = C0; \ D1 = t; \
C0 = C1; \ }
C1 = t0; \
\
t0 = v128_alignr8(D0, D1, 8); \
t1 = v128_alignr8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#else /* SSE2 */ #else /* SSE2 */
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \ #define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
do { \ { \
v128_t t0 = D0; \ v128_t t = D0; \
v128_t t1 = B0; \ D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
D0 = C0; \ D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
C0 = C1; \ t = B0; \
C1 = D0; \
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \ B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \ B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
} while ((void)0, 0) }
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \ #define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
do { \ { \
v128_t t0, t1; \ v128_t t = B0; \
t0 = C0; \
C0 = C1; \
C1 = t0; \
t0 = B0; \
t1 = D0; \
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \ B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \ B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
t = D0; \
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \ D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \ D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
} while ((void)0, 0) }
#endif #endif
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \ #define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
do { \ { \
G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \ G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \ G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
\
DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \ DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
\ G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \ G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \ UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
} while ((void)0, 0) }
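
The argon2d speed-up in this 128-bit (SSSE3/NEON) path comes largely from dropping the physical C0/C1 swap that the old DIAGONALIZE/UNDIAGONALIZE performed: the rewritten BLAKE2_ROUND instead passes C1 and C0 swapped to the second G1/G2 pair, so the column rotation of C costs no instructions. A minimal scalar sketch of why the two forms are equivalent (g() is a stand-in for the real quarter-round, not project code):

#include <stdio.h>
#include <stdint.h>

static void g( uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d )
{
    *a += *b; *d ^= *a; *c += *d; *b ^= *c;    /* stand-in for G1/G2 */
}

int main(void)
{
    uint64_t A0 = 1, B0 = 2, C0 = 3, D0 = 4;
    uint64_t A1 = 5, B1 = 6, C1 = 7, D1 = 8;

    /* Old approach: swap C0/C1, run G on both columns, swap back. */
    uint64_t a0 = A0, b0 = B0, c0 = C0, d0 = D0;
    uint64_t a1 = A1, b1 = B1, c1 = C1, d1 = D1;
    uint64_t t = c0; c0 = c1; c1 = t;          /* explicit register swap */
    g( &a0, &b0, &c0, &d0 ); g( &a1, &b1, &c1, &d1 );
    t = c0; c0 = c1; c1 = t;                   /* swap back (UNDIAGONALIZE) */

    /* New approach: leave C in place, swap the arguments instead. */
    uint64_t x0 = A0, y0 = B0, z0 = C0, w0 = D0;
    uint64_t x1 = A1, y1 = B1, z1 = C1, w1 = D1;
    g( &x0, &y0, &z1, &w0 ); g( &x1, &y1, &z0, &w1 );

    printf( "%d\n", c0 == z0 && c1 == z1 && a0 == x0 && b1 == y1 );  /* prints 1 */
    return 0;
}
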
#else /* __AVX2__ */ #else /* __AVX2__ */
#include <immintrin.h> #include <immintrin.h>
@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ #define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \ do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
tmp1 = C0; \ B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
} while(0); } while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ #define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do { \ do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
tmp1 = C0; \ B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
} while((void)0, 0); } while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ #define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \ do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0); } while((void)0, 0);
@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do{ \ do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\ G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0); } while((void)0, 0);
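
Same idea in the AVX2 path: DIAGONALIZE_2/UNDIAGONALIZE_2 no longer swap C0/C1 (BLAKE2_ROUND_2 swaps them in the G call arguments instead), and the cross-lane _mm256_permute4x64_epi64( x, _MM_SHUFFLE(2,3,0,1) ) is replaced by the in-lane _mm256_shuffle_epi32( x, 0x4e ), which swaps the 64-bit halves of each 128-bit lane with lower latency. A quick equivalence check, offered as a sketch (build with -mavx2):

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    __m256i x = _mm256_set_epi64x( 0x4444444444444444ULL, 0x3333333333333333ULL,
                                   0x2222222222222222ULL, 0x1111111111111111ULL );
    __m256i a = _mm256_permute4x64_epi64( x, _MM_SHUFFLE(2,3,0,1) );  /* cross-lane permute   */
    __m256i b = _mm256_shuffle_epi32( x, 0x4e );                      /* in-lane dword shuffle */
    printf( "%d\n", !memcmp( &a, &b, 32 ) );                          /* prints 1 */
    return 0;
}
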
@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#include <immintrin.h> #include <immintrin.h>
/*
static inline __m512i muladd(__m512i x, __m512i y) static inline __m512i muladd(__m512i x, __m512i y)
{ {
__m512i z = _mm512_mul_epu32(x, y); __m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
} }
*/
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
}
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
}
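
The new AVX-512 G1/G2 macros simply expand the commented-out muladd() helper inline, interleaving the A0/A1 and C0/C1 halves so the two dependency chains overlap. The per-lane operation is Argon2's BlaMka mix; a scalar reference for comparison (a sketch, not project code):

#include <stdint.h>
#include <stdio.h>

/* x + y + 2 * (low 32 bits of x) * (low 32 bits of y), all mod 2^64. */
static uint64_t fBlaMka( uint64_t x, uint64_t y )
{
    uint64_t m = (uint64_t)(uint32_t)x * (uint64_t)(uint32_t)y;  /* _mm512_mul_epu32 */
    return x + y + 2 * m;                                        /* add + doubled product */
}

int main(void)
{
    printf( "%016llx\n", (unsigned long long)fBlaMka( 0x0123456789abcdefULL,
                                                      0xfedcba9876543210ULL ) );
    return 0;
}
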
/*
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ #define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \ do { \
A0 = muladd(A0, B0); \ A0 = muladd(A0, B0); \
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 24); \ B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \ B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0) } while ((void)0, 0)
*/
/*
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \ do { \
A0 = muladd(A0, B0); \ A0 = muladd(A0, B0); \
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 63); \ B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \ B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0) } while ((void)0, 0)
*/
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \ do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while ((void)0, 0) } while ((void)0, 0)
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \ do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while ((void)0, 0) } while ((void)0, 0)
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \ do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \ G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \ G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \ G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \ G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0) } while ((void)0, 0)
static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 };
static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 };
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };
#define SWAP_HALVES(A0, A1) \ #define SWAP_HALVES(A0, A1) \
do { \ do { \
__m512i t; \ __m512i t; \
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
A0 = t; \ A0 = t; \
} while((void)0, 0) } while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
A0 = t; \
}
#define UNSWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
A0 = t; \
}
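
The new SWAP_QUARTERS/UNSWAP_QUARTERS use a single _mm512_permutex2var_epi64 per output instead of SWAP_HALVES plus _mm512_shuffle_i64x2. Each index vector addresses the 16 qwords of the concatenated pair A0:A1 (0-7 select from A0, 8-15 from A1). A scalar sketch of that indexing, using the swap_q0 table from this patch:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of _mm512_permutex2var_epi64: index < 8 selects from a,
 * index >= 8 selects from b; only the low 4 index bits matter. */
static void permutex2var_epi64( const uint64_t a[8], const uint64_t idx[8],
                                const uint64_t b[8], uint64_t out[8] )
{
    for ( int i = 0; i < 8; i++ )
    {
        uint64_t k = idx[i] & 15;
        out[i] = ( k < 8 ) ? a[k] : b[k - 8];
    }
}

int main(void)
{
    uint64_t a[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };          /* A0 qwords */
    uint64_t b[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };  /* A1 qwords */
    uint64_t swap_q0[8] = { 0,1, 8,9, 2,3, 10,11 };      /* from the patch */
    uint64_t r[8];
    permutex2var_epi64( a, swap_q0, b, r );
    for ( int i = 0; i < 8; i++ ) printf( "%llu ", (unsigned long long)r[i] );
    printf( "\n" );                                      /* 0 1 10 11 2 3 12 13 */
    return 0;
}
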
/*
#define SWAP_QUARTERS(A0, A1) \ #define SWAP_QUARTERS(A0, A1) \
do { \ do { \
SWAP_HALVES(A0, A1); \ SWAP_HALVES(A0, A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \ A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \ A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0) } while((void)0, 0)
*/
/*
#define UNSWAP_QUARTERS(A0, A1) \ #define UNSWAP_QUARTERS(A0, A1) \
do { \ do { \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \ A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \ A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
SWAP_HALVES(A0, A1); \ SWAP_HALVES(A0, A1); \
} while((void)0, 0) } while((void)0, 0)
*/
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \ #define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
do { \ do { \

View File

@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mj[14] = mm256_rol_64( M[14], 15 ); mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 ); mj[15] = mm256_rol_64( M[15], 16 );
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL ); __m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL ); static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K ); qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr ); K = _mm256_add_epi64( K, Kincr );
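
The K change here (and in the 8-way hunk below) is a pure constant fold: the BMW schedule constant starts at element 16 of the 0x0555555555555555 sequence, and 16 * 0x0555555555555555 is exactly 0x5555555555555550, so the literal replaces the multiply; Kincr additionally becomes a statically initialised vector rather than a runtime broadcast. A one-line check:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t old_k = 16 * 0x0555555555555555ULL;   /* as previously written    */
    uint64_t new_k = 0x5555555555555550ULL;        /* literal used in the patch */
    printf( "%d\n", old_k == new_k );              /* prints 1 */
    return 0;
}
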
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] ) __m512i dH[16] )
{ {
__m512i qt[32], xl, xh; __m512i qt[32], xl, xh;
__m512i mh[16]; __m512i mh[16], mj[16];
int i; int i;
for ( i = 0; i < 16; i++ ) for ( i = 0; i < 16; i++ )
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] ); qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] ); qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
__m512i mj[16];
mj[ 0] = mm512_rol_64( M[ 0], 1 ); mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 ); mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 ); mj[ 2] = mm512_rol_64( M[ 2], 3 );
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mj[14] = mm512_rol_64( M[14], 15 ); mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 ); mj[15] = mm512_rol_64( M[15], 16 );
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL ); __m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL ); static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K ); qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr ); K = _mm512_add_epi64( K, Kincr );

View File

@@ -503,32 +503,28 @@ do { \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \ SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \ SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
s4 = mm512_swap64_32( s4 ); \ s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \ t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
sD = mm512_swap64_32( sD ); \ sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \ t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
L8( s0, t0, s9, t1 ); \ L8( s0, t0, s9, t1 ); \
s6 = mm512_swap64_32( s6 ); \ s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \ sF = mm512_swap64_32( sF ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \ t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
L8( s1, t2, sA, t3 ); \ L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \ s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \ sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\ \
s7 = mm512_swap64_32( s7 ); \ t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
sC = mm512_swap64_32( sC ); \ t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
L8( s2, t4, sB, t5 ); \ L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \ s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \ sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \ s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \ sF = mm512_swap64_32( sF ); \
\ \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \ t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \ t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
L8( s3, t2, s8, t3 ); \ L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \ s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \ s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
@@ -537,21 +533,20 @@ do { \
s7 = mm512_swap64_32( s7 ); \ s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \ sC = mm512_swap64_32( sC ); \
\ \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \ t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \ t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \ t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \ t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \ L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \ s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \ s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \ s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \ s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \ s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \ sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \ s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \ sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
\ \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \ t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \ t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
@@ -1268,7 +1263,7 @@ do { \
} while (0) } while (0)
#endif #endif
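
The hamsi AVX-512 gain above comes from _mm512_mask_shuffle_epi32 with control 0xb1, which swaps the 32-bit halves of each 64-bit element and merges under a mask in one instruction, replacing a separate mm512_swap64_32 plus _mm512_mask_blend_epi32. A sketch of the t0 case using plain intrinsics (build with -mavx512f; runs only on AVX-512 hardware):

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    __m512i s4 = _mm512_set1_epi32( 0x11111111 );
    __m512i s5 = _mm512_setr_epi32( 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 );

    /* old: swap the 32-bit halves of every 64-bit element of s5, then blend */
    __m512i swapped = _mm512_shuffle_epi32( s5, 0xb1 );
    __m512i old_t0  = _mm512_mask_blend_epi32( 0xaaaa, s4, swapped );

    /* new: one masked shuffle, odd dwords taken from the swapped s5 */
    __m512i new_t0  = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 );

    printf( "%d\n", !memcmp( &old_t0, &new_t0, 64 ) );   /* prints 1 */
    return 0;
}
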
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions // v3, 15 instructions
#define SBOX( a, b, c, d ) \ #define SBOX( a, b, c, d ) \
{ \ { \
__m256i tb, td; \ __m256i tb, td; \
@@ -1286,7 +1281,7 @@ do { \
#endif #endif
/* /*
/ v2, 16 instructions, 10 TL equivalent instructions / v2, 16 instructions
#define SBOX( a, b, c, d ) \ #define SBOX( a, b, c, d ) \
{ \ { \
__m256i t = mm256_xorand( d, a, c ); \ __m256i t = mm256_xorand( d, a, c ); \

View File

@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
__m512i t = a0; \ __m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \ a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512( a2, a3 ); \ a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \ a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0); \ a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \ a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \ a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \ t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \ a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \ a1 = mm512_nxor( a1, a0 ); \
a0 = t; \ a0 = t; \
} }
@@ -527,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m256i t = a0; \ __m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \ a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \ a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \ a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \ a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \ a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \ a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \ t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \ a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \ a1 = mm256_nxor( a1, a0 ); \
a0 = t; \ a0 = t; \
} }
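
The comment fix ("nxor" / "~a1 ^ (a3 & t)") describes what the 0x87 immediate already computes: the ternarylogic intrinsics apply, bit for bit, the boolean function whose 8-bit truth table is the immediate, indexed by (a<<2)|(b<<1)|c. Deriving the constant for a1 XNOR (a3 & t):

#include <stdio.h>

int main(void)
{
    unsigned imm = 0;
    for ( int a = 0; a < 2; a++ )
      for ( int b = 0; b < 2; b++ )
        for ( int c = 0; c < 2; c++ )
        {
            int f = !( a ^ ( b & c ) );                 /* a1 nxor (a3 & t) */
            imm |= f << ( (a << 2) | (b << 1) | c );
        }
    printf( "0x%02x\n", imm );                          /* prints 0x87 */
    return 0;
}
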

View File

@@ -69,18 +69,18 @@
v128_t t = a0; \ v128_t t = a0; \
a0 = v128_xoror( a3, a0, a1 ); \ a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \ a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
a3 = v128_xorand( a2, a3, t ); \ a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \ a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \ a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \ a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \ t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \ a2 = v128_and( a2, a1 ); \
a1 = v128_xnor( a1, a0 ); \ a1 = v128_nxor( a1, a0 ); \
a0 = t; \ a0 = t; \
} }
#else #elif defined(__ARM_NEON) || defined(__SSE2__)
#define SUBCRUMB( a0, a1, a2, a3 ) \ #define SUBCRUMB( a0, a1, a2, a3 ) \
{ \ { \

View File

@@ -67,7 +67,7 @@ static const uint64_t K512[80] =
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
}; };
#if defined(__AVX2__) && defined(__SHA512__) #if defined(__AVX__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension. // SHA-512 implemented using SHA512 CPU extension.

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h" #include "simd-utils.h"
#include "sph_sha2.h" #include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__) #if defined(__SHA512__) && defined(__AVX__)
// Experimental, untested // Experimental, untested
// Need to substitute for sph_sha512 // Need to substitute for sph_sha512

View File

@@ -305,7 +305,7 @@ do { \
xb0 = mm512_rol_32( xb0, 1 ); \ xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \ xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \ mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \ xb0 = mm512_nxor( xa0, xb0 ); \
} while (0) } while (0)
#define PERM_STEP_0_16 do { \ #define PERM_STEP_0_16 do { \
@@ -898,7 +898,7 @@ do { \
xb0 = mm256_rol_32( xb0, 1 ); \ xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \ xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \ mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \ xb0 = mm256_nxor( xa0, xb0 ); \
} while (0) } while (0)
#define PERM_STEP_0_8 do { \ #define PERM_STEP_0_8 do { \

View File

@@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
{{ -30, 55, -58, -65, -95, -40, -98, 94 }}, {{ -30, 55, -58, -65, -95, -40, -98, 94 }},
}; };
#if defined(__AVX2__)
static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff,
0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#define V128_00FF _mm256_castsi256_si128( V256_00FF )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#endif
#if defined(SIMD512)
static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V256_0101 _mm512_castsi512_si256( V512_0101 )
#define V128_0101 _mm512_castsi512_si128( V512_0101 )
static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V256_0080 _mm512_castsi512_si256( V512_0080 )
#define V128_0080 _mm512_castsi512_si128( V512_0080 )
#elif defined(__AVX2__)
static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V128_0101 _mm256_castsi256_si128( V256_0101 )
static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V128_0080 _mm256_castsi256_si128( V256_0080 )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 };
static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 };
#endif
#if defined(__x86_64__) #if defined(__x86_64__)
#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1) #define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
@@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x) #define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE(x) \ #define REDUCE(x) \
v128_sub16( v128_and( x, v128_64( \ v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) )
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\ #define EXTRA_REDUCE_S(x)\
v128_sub16( x, v128_and( \ v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) )
v128_64( 0x0101010101010101 ), \
v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) #define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
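
The REDUCE/EXTRA_REDUCE_S change only hoists the 0x00ff, 0x0101 and 0x0080 constants into the static vectors defined above; the arithmetic is unchanged. Per 16-bit lane it is a partial reduction modulo 257 (since 256 == -1 mod 257, x = hi*256 + lo == lo - hi), shown here as a scalar sketch:

#include <stdio.h>
#include <stdint.h>

static int16_t reduce( int16_t x )         { return (x & 0xff) - (x >> 8); } /* lo - hi */
static int16_t extra_reduce_s( int16_t x ) { return x > 128 ? x - 257 : x; } /* fold > 128 */

int main(void)
{
    int16_t x = 12345;
    int16_t r = extra_reduce_s( reduce( x ) );
    printf( "%d %d\n", r, x % 257 );      /* prints "9 9": same residue mod 257 */
    return 0;
}
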
@@ -293,10 +337,9 @@ do { \
// This will make the full FFT_64 in order. // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \ #define INTERLEAVE(i,j) \
do { \ do { \
v128u16_t t1= X(i); \ v128u16_t t = X(i); \
v128u16_t t2= X(j); \ X(i) = v128_unpacklo16( t, X(j) ); \
X(i) = v128_unpacklo16( t1, t2 ); \ X(j) = v128_unpackhi16( t, X(j) ); \
X(j) = v128_unpackhi16( t1, t2 ); \
} while(0) } while(0)
INTERLEAVE( 1, 0 ); INTERLEAVE( 1, 0 );
@@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] =
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x) #define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#if defined(VL256)
#define REDUCE(x) \ #define REDUCE(x) \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \ _mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) )
_mm256_srai_epi16( x, 8 ) )
#else
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
#endif
#define EXTRA_REDUCE_S(x)\ #define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \ _mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \
_mm256_set1_epi64x( 0x0101010101010101 ), \ _mm256_cmpgt_epi16( x, V256_0080 ) ) )
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) #define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -917,10 +949,9 @@ do { \
// This will make the full FFT_64 in order. // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \ #define INTERLEAVE(i,j) \
do { \ do { \
__m256i t1= X(i); \ __m256i t = X(i); \
__m256i t2= X(j); \ X(i) = _mm256_unpacklo_epi16( t, X(j) ); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \ X(j) = _mm256_unpackhi_epi16( t, X(j) ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
} while(0) } while(0)
INTERLEAVE( 1, 0 ); INTERLEAVE( 1, 0 );
@@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_srai_epi16( x, 8 ) ) _mm512_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S4w(x) \ #define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \ _mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \
_mm512_set1_epi64( 0x0101010101010101 ), \ _mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) )
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
// generic, except it calls targetted macros // generic, except it calls targetted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) ) #define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

View File

@@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#if defined(__AVX2__) #if defined(__AVX2__)
__m256i F0, F1, F2, F3, F4, F5, F6, F7; __m256i F0, F1, F2, F3, F4, F5, F6, F7;
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] ); __m256i *table = (__m256i*)fftTable;
__m256i tbl = table[ input[0] ];
__m256i *mul = (__m256i*)multipliers; __m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output; __m256i *out = (__m256i*)output;
F0 = _mm256_mullo_epi32( mul[0], tbl ); F0 = _mm256_mullo_epi32( mul[0], tbl );
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] ); tbl = table[ input[1] ];
F1 = _mm256_mullo_epi32( mul[1], tbl ); F1 = _mm256_mullo_epi32( mul[1], tbl );
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] ); tbl = table[ input[2] ];
F2 = _mm256_mullo_epi32( mul[2], tbl ); F2 = _mm256_mullo_epi32( mul[2], tbl );
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] ); tbl = table[ input[3] ];
F3 = _mm256_mullo_epi32( mul[3], tbl ); F3 = _mm256_mullo_epi32( mul[3], tbl );
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] ); tbl = table[ input[4] ];
F4 = _mm256_mullo_epi32( mul[4], tbl ); F4 = _mm256_mullo_epi32( mul[4], tbl );
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] ); tbl = table[ input[5] ];
F5 = _mm256_mullo_epi32( mul[5], tbl ); F5 = _mm256_mullo_epi32( mul[5], tbl );
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] ); tbl = table[ input[6] ];
F6 = _mm256_mullo_epi32( mul[6], tbl ); F6 = _mm256_mullo_epi32( mul[6], tbl );
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] ); tbl = table[ input[7] ];
F7 = _mm256_mullo_epi32( mul[7], tbl ); F7 = _mm256_mullo_epi32( mul[7], tbl );
#define ADD_SUB( a, b ) \ #define ADD_SUB( a, b ) \
@@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F1, F3 ); ADD_SUB( F1, F3 );
ADD_SUB( F4, F6 ); ADD_SUB( F4, F6 );
ADD_SUB( F5, F7 ); ADD_SUB( F5, F7 );
F5 = _mm256_slli_epi32( F5, 2 );
F6 = _mm256_slli_epi32( F6, 4 ); F6 = _mm256_slli_epi32( F6, 4 );
F7 = _mm256_slli_epi32( F7, 6 ); F7 = _mm256_slli_epi32( F7, 6 );
F5 = _mm256_slli_epi32( F5, 2 );
ADD_SUB( F0, F4 ); ADD_SUB( F0, F4 );
ADD_SUB( F1, F5 ); ADD_SUB( F1, F5 );
ADD_SUB( F2, F6 ); ADD_SUB( F2, F6 );
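
The swifftx lookup rewrite is equivalent addressing: each fftTable row holds eight 32-bit words (one __m256i), so indexing a row pointer by input[i] lands on the same row as the old input[i] << 3 element offset, and the F5/F6/F7 shift reorder only changes instruction scheduling. A scalar sketch of the indexing equivalence (stand-in table, not the real fftTable):

#include <stdio.h>
#include <stdint.h>

#define ROWS 4
static int32_t fftTable[ROWS * 8];

int main(void)
{
    for ( int i = 0; i < ROWS * 8; i++ ) fftTable[i] = i;

    unsigned idx = 2;                                /* stand-in for input[i]        */
    int32_t *old_row = &fftTable[ idx << 3 ];        /* old: element-offset form     */
    int32_t (*table)[8] = (int32_t (*)[8])fftTable;  /* new: index whole 8-wide rows */
    int32_t *new_row = table[ idx ];

    printf( "%d\n", old_row == new_row );            /* prints 1 */
    return 0;
}
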

View File

@@ -4,11 +4,11 @@
# during development. However, the information contained may provide compilation # during development. However, the information contained may provide compilation
# tips to users. # tips to users.
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null rm cpuminer cpuminer-m2 cpuminer-m4 cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto > /dev/null
# armv9 needs gcc-13 # armv9 needs gcc-13
# -march-armv9-a includes SVE2 but no crypto # -march-armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512 # -march=armv9-a+crypto adds AES & SHA256 but not SHA512
make distclean || echo clean make distclean || echo clean
rm -f config.status rm -f config.status
@@ -27,18 +27,37 @@ CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-c
make -j $(nproc) make -j $(nproc)
mv cpuminer cpuminer-armv9 mv cpuminer cpuminer-armv9
# Apple M4: armv9.2, AES, SHA3, SVE2
make clean || echo clean
CFLAGS="-O3 -march=armv9.2-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m4
# Apple M2: armv8.6, AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.6-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m2
# SVE2 available in armv8.5 # SVE2 available in armv8.5
make clean || echo clean make clean || echo clean
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2 mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4 # Apple M1: armv8.4 AES, SHA3
make clean || echo clean make clean || echo clean
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
mv cpuminer cpuminer-armv8.4-crypto-sha3 mv cpuminer cpuminer-armv8.4-crypto-sha3
# Cortex-A76 (Orange Pi 5)
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8.2-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8-crypto
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl

View File

@@ -34,9 +34,7 @@ make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-arrowlake-s mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14 # Intel Core Graniterapids: AVX512, SHA256, VAES, AMX, needs gcc-14
# Granitrapids does not build with AVX10, SHA512 or APX.
# wait for Diamondrapids & gcc-15.
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
@@ -44,13 +42,13 @@ make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-graniterapids mv cpuminer cpuminer-graniterapids
# SHA512 AVX10.1 # Graniterapids + SHA512, AVX10.1
#make clean || echo clean make clean || echo clean
#rm -f config.status rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
#make -j $(nproc) make -j $(nproc)
#strip -s cpuminer strip -s cpuminer
#mv cpuminer cpuminer-avx10_1 mv cpuminer cpuminer-avx10.1
# SHA512 AVX10.2 # SHA512 AVX10.2
#make clean || echo clean #make clean || echo clean
@@ -72,6 +70,9 @@ mv cpuminer cpuminer-graniterapids
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
# zen4 is close enough for older compiler
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-zen5 mv cpuminer cpuminer-zen5
@@ -138,13 +139,21 @@ make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron # SSE4.2 AES SHA: Intel Atom Goldmont, newer Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=goldmont -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse42-aes-sha
# SSE4.2 AES: Intel Westmere, older Pentium & Celeron
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl
make -j $(nproc) make -j $(nproc)
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-aes-sse42 mv cpuminer cpuminer-sse42-aes
# SSE4.2: Intel Nehalem # SSE4.2: Intel Nehalem
make clean || echo clean make clean || echo clean

View File

@@ -2,8 +2,8 @@
# #
# make clean and rm all the targetted executables. # make clean and rm all the targetted executables.
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512* cpuminer-alderlake cpuminer-avx10* cpuminer-avx2* cpuminer-avx cpuminer-sse* cpuminer-ssse3 cpuminer-zen* cpuminer-x64 cpuminer-armv* cpuminer-m2 cpuminer-m4 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null rm cpuminer-avx512* cpuminer-avx2* cpuminer-avx.exe cpuminer-sse* cpuminer-zen* cpuminer-x64.exe > /dev/null
make distclean > /dev/null make distclean > /dev/null

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.6. # Generated by GNU Autoconf 2.71 for cpuminer-opt 25.7.
# #
# #
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.6' PACKAGE_VERSION='25.7'
PACKAGE_STRING='cpuminer-opt 25.6' PACKAGE_STRING='cpuminer-opt 25.7'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";; short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1536,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 25.6 cpuminer-opt configure 25.7
generated by GNU Autoconf 2.71 generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc. Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.6, which was It was created by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw $ $0$ac_configure_args_raw
@@ -3591,7 +3591,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='25.6' VERSION='25.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 25.6, which was This file was extended by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped' ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 25.6 cpuminer-opt config.status 25.7
configured by $0, generated by GNU Autoconf 2.71, configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.6]) AC_INIT([cpuminer-opt], [25.7])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.6. # Generated by GNU Autoconf 2.72 for cpuminer-opt 25.7.
# #
# #
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation, # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.6' PACKAGE_VERSION='25.7'
PACKAGE_STRING='cpuminer-opt 25.6' PACKAGE_STRING='cpuminer-opt 25.7'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
'configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems. 'configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1424,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";; short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1528,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 25.6 cpuminer-opt configure 25.7
generated by GNU Autoconf 2.72 generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc. Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.6, which was It was created by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.72. Invocation command line was generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw $ $0$ac_configure_args_raw
@@ -3768,7 +3768,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='25.6' VERSION='25.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7581,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 25.6, which was This file was extended by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.72. Invocation command line was generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -7649,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped' ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 25.6 cpuminer-opt config.status 25.7
configured by $0, generated by GNU Autoconf 2.72, configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -921,39 +921,32 @@ out:
return rc; return rc;
} }
// returns the unit prefix and the hashrate appropriately scaled. // Does not account for leap years.
void scale_hash_for_display ( double* hashrate, char* prefix )
{
if ( *hashrate < 1e4 ) *prefix = 0;
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
else { *prefix = 'Y'; *hashrate /= 1e24; }
}
static inline void sprintf_et( char *str, long unsigned int seconds ) static inline void sprintf_et( char *str, long unsigned int seconds )
{ {
long unsigned int min = seconds / 60; long unsigned int minutes = seconds / 60;
long unsigned int sec = seconds % 60; if ( minutes )
long unsigned int hrs = min / 60; {
long unsigned int hours = minutes / 60;
if ( unlikely( hrs ) ) if ( hours )
{
long unsigned int days = hours / 24;
if ( days )
{ {
long unsigned int days = hrs / 24;
long unsigned int years = days / 365; long unsigned int years = days / 365;
if ( years ) // 0y000d if ( years )
sprintf( str, "%luy%lud", years, years % 365 ); sprintf( str, "%luy%03lud", years, days % 365 ); // 0y000d
else if ( days ) // 0d00h else
sprintf( str, "%lud%02luh", days, hrs % 24 ); sprintf( str, "%lud%02luh", days, hours % 24 ); // 0d00h
else // 0h00m
sprintf( str, "%luh%02lum", hrs, min % 60 );
} }
else // 0m00s else
sprintf( str, "%lum%02lus", min, sec ); sprintf( str, "%luh%02lum", hours, minutes % 60 ); // 0h00m
}
else
sprintf( str, "%lum%02lus", minutes, seconds % 60 ); // 0m00s
}
else
sprintf( str, "%lus", seconds ); // 0s
} }
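
This hunk is the changelog's "TTF longer than 1 year" fix: the old year branch printed years % 365 where days % 365 was intended (so 1 year 40 days displayed as 1y1d) and lacked zero padding. A standalone sketch that mirrors the patched logic, flattened into one if-chain (365-day years, leap years ignored, as the new comment notes):

#include <stdio.h>

static void sprintf_et( char *str, unsigned long seconds )
{
    unsigned long minutes = seconds / 60;
    unsigned long hours   = minutes / 60;
    unsigned long days    = hours / 24;
    unsigned long years   = days / 365;

    if ( years )        sprintf( str, "%luy%03lud", years, days % 365 );    /* 0y000d */
    else if ( days )    sprintf( str, "%lud%02luh", days, hours % 24 );     /* 0d00h  */
    else if ( hours )   sprintf( str, "%luh%02lum", hours, minutes % 60 );  /* 0h00m  */
    else if ( minutes ) sprintf( str, "%lum%02lus", minutes, seconds % 60 );/* 0m00s  */
    else                sprintf( str, "%lus", seconds );                    /* 0s     */
}

int main(void)
{
    char buf[32];
    sprintf_et( buf, 365UL*24*3600 + 40UL*24*3600 );  /* 1 year 40 days */
    printf( "%s\n", buf );                            /* prints 1y040d  */
    return 0;
}
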
const long double exp32 = EXP32; // 2**32 const long double exp32 = EXP32; // 2**32
@@ -2833,67 +2826,29 @@ static void show_credits()
static bool cpu_capability( bool display_only ) static bool cpu_capability( bool display_only )
{ {
char cpu_brand[0x40]; char cpu_brand[0x40];
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false; bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false; bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64 bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64 bool sw_has_sve = false;
bool sw_has_sve2 = false; // AArch64 bool sw_has_sve2 = false;
bool sw_has_sme = false; bool sw_has_sme = false;
bool sw_has_sme2 = false; bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64 bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64 bool sw_has_ssse3 = false;
bool sw_has_sse41 = false; // x86_64 bool sw_has_sse41 = false;
bool sw_has_sse42 = false; bool sw_has_sse42 = false;
bool sw_has_avx = false; bool sw_has_avx = false;
bool sw_has_avx2 = false; bool sw_has_avx2 = false;
bool sw_has_avx512 = false; bool sw_has_avx512 = false;
bool sw_has_avx10 = false; bool sw_has_avx10 = false;
bool sw_has_aes = false; bool sw_has_amx = false;
bool sw_has_vaes = false; bool sw_has_apx = false;
bool sw_has_aes = false; // x86_64 or AArch64
bool sw_has_vaes = false; // x86_64
bool sw_has_sha256 = false; // x86_64 or AArch64 bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64 bool sw_has_sha512 = false;
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_aes;
bool use_vaes;
bool use_sha256;
bool use_sha512;
bool use_neon;
bool use_none;
*/
#if defined(__x86_64__) #if defined(__x86_64__)
sw_has_x86_64 = true; sw_has_x86_64 = true;
#elif defined(__aarch64__) #elif defined(__aarch64__)
@@ -2928,14 +2883,15 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__)) #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true; sw_has_avx512 = true;
#endif #endif
// AVX10 version is not significant as of AVX10.2. If that changes use a better #if defined(__AVX10_1__) // version is not significant
// way to test the version than sequentially.
// #if defined(__AVX10_2__)
//
// #elif defined(__AVX10_1__)
#if defined(__AVX10_1__)
sw_has_avx10 = true; sw_has_avx10 = true;
#endif #endif
#ifdef __AMX_TILE__
sw_has_amx = true;
#endif
#ifdef __APX_F__
sw_has_apx = true;
#endif
// x86_64 or AArch64 // x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES) #if defined(__AES__) || defined(__ARM_FEATURE_AES)
@@ -2955,6 +2911,7 @@ static bool cpu_capability( bool display_only )
#if defined(__ARM_NEON) #if defined(__ARM_NEON)
sw_has_neon = true; sw_has_neon = true;
#endif #endif
// FYI, SVE & SME not used by cpuminer
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
sw_has_sve = true; sw_has_sve = true;
#endif #endif
@@ -2975,8 +2932,7 @@ static bool cpu_capability( bool display_only )
// Build // Build
printf( "SW built on " __DATE__ printf( "SW built on " __DATE__
#if defined(__clang__) #if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, " with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__ );
__clang_patchlevel__ );
#elif defined(__GNUC__) #elif defined(__GNUC__)
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ ); " with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif #endif
@@ -3002,27 +2958,30 @@ static bool cpu_capability( bool display_only )
printf("CPU features: "); printf("CPU features: ");
if ( cpu_arch_x86_64() ) if ( cpu_arch_x86_64() )
{ {
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() ); if ( has_avx10() ) printf( " AVX10.%d", avx10_version() );
if ( cpu_has_avx512 ) printf( " AVX512" ); else if ( has_avx512() ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " ); else if ( has_avx2() ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " ); else if ( has_avx() ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" ); else if ( has_sse42() ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" ); else if ( has_sse41() ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " ); else if ( has_ssse3() ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " ); else if ( has_sse2() ) printf( " SSE2 " );
if ( has_amx() ) printf( " AMX" );
if ( has_apx_f() ) printf( " APX" );
} }
else if ( cpu_arch_aarch64() ) else if ( cpu_arch_aarch64() )
{ {
if ( cpu_has_neon ) printf( " NEON" ); if ( has_neon() ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() ); if ( has_sve2() ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" ); else if ( has_sve() ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" ); if ( has_sme2() ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" ); else if ( has_sme() ) printf( " SME" );
} }
if ( cpu_has_vaes ) printf( " VAES" ); if ( has_vaes() ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" ); else if ( has_aes() ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" ); if ( has_sha512() ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" ); else if ( has_sha256() ) printf( " SHA256" );
printf("\nSW features: "); printf("\nSW features: ");
if ( sw_has_x86_64 ) if ( sw_has_x86_64 )
@@ -3035,6 +2994,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_sse41 ) printf( " SSE4.1" ); else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " ); else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " ); else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_amx ) printf( " AMX" );
if ( sw_has_apx ) printf( " APX" );
} }
else if ( sw_has_aarch64 ) else if ( sw_has_aarch64 )
{ {

@@ -542,7 +542,9 @@ void applog_hash(void *hash);
void format_hashrate(double hashrate, char *output); void format_hashrate(double hashrate, char *output);
void print_hash_tests(void); void print_hash_tests(void);
// Factors of 1000 used for hash rates, i.e. kH/s, MH/s.
void scale_hash_for_display ( double* hashrate, char* units ); void scale_hash_for_display ( double* hashrate, char* units );
// Factors of 1024 used for byte counts, i.e. KiB, MiB.
void format_number_si( double* hashrate, char* si_units ); void format_number_si( double* hashrate, char* si_units );
void report_summary_log( bool force ); void report_summary_log( bool force );
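A short usage sketch (not part of the commit) may help illustrate the distinction; it assumes the scale_hash_for_display implementation shown later in this diff, and format_number_si behaves the same way with 1024-based prefixes for the caller to append "iB":

#include <stdio.h>

// Declared in miner.h above: scales the value in place and returns the
// decimal prefix character, or 0 if the value is small enough to print as-is.
void scale_hash_for_display( double* hashrate, char* units );

static void print_rate_example( void )
{
   double rate = 12345678.0;                   // H/s
   char prefix = 0;
   scale_hash_for_display( &rate, &prefix );   // rate = 12.345678, prefix = 'M'
   printf( "%.2f %cH/s\n", rate, prefix );     // prints "12.35 MH/s"
}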

@@ -447,7 +447,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 ) #define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b // ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 ) #define v128_nxor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else #else
@@ -469,7 +469,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) ) #define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) ) #define v128_nxor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif #endif
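For readers unfamiliar with ternary logic immediates, a standalone sketch (not part of the commit) of why 0x81 acts as ~(a ^ b) when the same register is passed as both the second and third operands, as the renamed v128_nxor macro does: bit (a<<2 | b<<1 | c) of the immediate is the function's value for that input combination, and tying c to b means only four of the eight table entries are ever selected.

#include <stdio.h>

int main( void )
{
   const unsigned imm = 0x81;
   for ( unsigned a = 0; a < 2; a++ )
      for ( unsigned b = 0; b < 2; b++ )
      {
         unsigned idx  = ( a << 2 ) | ( b << 1 ) | b;   // c tied to b
         unsigned got  = ( imm >> idx ) & 1;            // table lookup
         unsigned want = ~( a ^ b ) & 1;                // expected xnor
         printf( "a=%u b=%u table=%u ~(a^b)=%u\n", a, b, got, want );
      }
   return 0;
}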

@@ -170,7 +170,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 ) #define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b // ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 ) #define mm256_nxor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else #else
@@ -208,7 +208,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) \ #define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) ) _mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \ #define mm256_nxor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) ) mm256_not( _mm256_xor_si256( a, b ) )
#endif #endif

@@ -18,8 +18,13 @@
// AVX512 intrinsics have a few changes from previous conventions. // AVX512 intrinsics have a few changes from previous conventions.
// //
// "_mm512_cmp" instructions now returns a bitmask instead of a vector mask. // "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction. // This removes the need for an explicit movemask instruction. It is also
// slower than AVX2 cmp. There is no version of AVX512 cmp that returns a
// vector result, resulting in a double penalty when a vector result is needed:
// a slower cmp instruction plus an extra instruction to convert the bit mask
// into a vector mask. 256 bit & 128 bit still have the legacy cmp returning a
// vector while AVX512VL adds masked versions returning a bit mask.
// //
// Many previously sizeless (si) instructions now have sized (epi) versions // Many previously sizeless (si) instructions now have sized (epi) versions
// to accommodate masking packed elements. // to accommodate masking packed elements.
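A minimal sketch (not part of the commit) of the double penalty described above, using only standard intrinsics; the AVX512 variant assumes AVX512DQ for the mask-to-vector conversion.

#include <immintrin.h>

#if defined(__AVX2__)
// AVX2: one instruction, the compare result is already a vector mask.
static inline __m256i cmpeq64_vecmask_avx2( __m256i a, __m256i b )
{
   return _mm256_cmpeq_epi64( a, b );
}
#endif

#if defined(__AVX512F__) && defined(__AVX512DQ__)
// AVX512: the compare produces a bit mask, so a second instruction is needed
// to widen it back into a vector mask when one is required.
static inline __m512i cmpeq64_vecmask_avx512( __m512i a, __m512i b )
{
   __mmask8 k = _mm512_cmpeq_epi64_mask( a, b );   // bit mask, not a vector
   return _mm512_movm_epi64( k );                  // expand to all-ones lanes
}
#endif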
@@ -30,7 +35,7 @@
// list. // list.
// //
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other // "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
// AVX512 permutes can cross all lanes. // AVX512 instructions using the permute name can cross all lanes.
// //
// New alignr instructions for epi64 and epi32 operate across the entire // New alignr instructions for epi64 and epi32 operate across the entire
// vector but slower than epi8 which continues to be restricted to 128 bit // vector but slower than epi8 which continues to be restricted to 128 bit
@@ -50,16 +55,17 @@
// parentheses to ensure the expression argument is evaluated first. // parentheses to ensure the expression argument is evaluated first.
// - if an argument is to be referenced multiple times a C inline function // - if an argument is to be referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument // should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (wasteful) or produces side // from being evaluated multiple times (wasteful) or producing side
// effects (very bad). // effects (very bad).
// //
// There are 2 areas where overhead is a major concern: constants and // There are 2 areas where overhead is a major concern: constants and
// permutations. // permutations.
// //
// Constants need to be composed at run time by assembling individual // Constants need to be composed at run time by assembling individual
// elements, very expensive. The cost is proportional to the number of // elements or loaded from memory, very expensive. The cost of runtime
// different elements therefore use the largest element size possible, // construction is proportional to the number of different elements,
// merge smaller integer elements to 64 bits, and group repeated elements. // therefore use the largest element size possible, merging smaller integer
// elements to 64 bits, and group repeated elements.
// //
// Constants with repeating patterns can be optimized with the smaller // Constants with repeating patterns can be optimized with the smaller
// patterns repeated more frequently being more efficient. // patterns repeated more frequently being more efficient.
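To make the cost model above concrete, a small illustration (not part of the commit) of a cheap repeating constant versus an expensive one with all-different elements, using only standard intrinsics:

#include <immintrin.h>

#if defined(__AVX512F__)
// Repeating pattern: two 32-bit values merged into one 64-bit element and
// broadcast, so only a single element has to be materialized.
static inline __m512i cheap_repeating_constant( void )
{
   return _mm512_set1_epi64( 0x0000000100000000ULL );
}

// Eight distinct 64-bit elements: each must be assembled, or the whole vector
// loaded from memory. Worth hoisting into a local variable, or a static
// const, if used more than once.
static inline __m512i expensive_constant( void )
{
   return _mm512_set_epi64( 7, 6, 5, 4, 3, 2, 1, 0 );
}
#endif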
@@ -67,14 +73,15 @@
// Some specific constants can be very efficient. Zero is very efficient, // Some specific constants can be very efficient. Zero is very efficient,
// 1 and -1 slightly less so. // 1 and -1 slightly less so.
// //
// If an expensive constant is to be reused in the same function it should // If an expensive constant is to be reused in the same function it may
// be declared as a local variable defined once and reused. // be declared as a local variable defined once and reused. If frequently
// used it can be declared as a static const in memory.
// //
// Permutations can be very expensive if they use a vector control index, // Permutations can be very expensive if they use a vector control index,
// even if the permutation itself is quite efficient. // even if the permute instruction itself is quite efficient.
// The index is essentially a constant with all the baggage that brings. // The index is essentially a vector constant with all the baggage that
// The same rules apply, if an index is to be reused it should be defined // brings. The same rules apply: if an index is to be reused it should either
// as a local. This applies specifically to bswap operations. // be defined as a local or static const.
// //
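A sketch (not part of the commit) of that recommendation applied to a byte swap: the shuffle control index is itself a full vector constant, so it is composed once outside the loop and reused. Assumes AVX512BW; the helper name and layout are illustrative only.

#include <immintrin.h>

#if defined(__AVX512BW__)
static inline void bswap32_block( __m512i *dst, const __m512i *src, int n )
{
   // Byte-swap index for 32-bit elements, defined once and reused for every
   // iteration instead of being re-composed inside the loop.
   const __m512i bswap32_index = _mm512_set_epi64(
         0x0c0d0e0f08090a0b, 0x0405060700010203,
         0x0c0d0e0f08090a0b, 0x0405060700010203,
         0x0c0d0e0f08090a0b, 0x0405060700010203,
         0x0c0d0e0f08090a0b, 0x0405060700010203 );
   for ( int i = 0; i < n; i++ )
      dst[i] = _mm512_shuffle_epi8( src[i], bswap32_index );
}
#endif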
// Permutations that cross 128 bit lanes are typically slower and often need // Permutations that cross 128 bit lanes are typically slower and often need
// a vector control index. If the permutation doesn't need to cross 128 bit // a vector control index. If the permutation doesn't need to cross 128 bit
@@ -221,7 +228,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 ) #define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b // ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 ) #define mm512_nxor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b ) // ~( a & b )
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef ) #define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
@@ -241,6 +248,15 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32 #define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32 #define mm512_rol_32 _mm512_rol_epi32
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_ror_16( v, c ) _mm512_shrdi_epi16( c, v, v )
#define mm512_rol_16( v, c ) _mm512_shldi_epi16( c, v, v )
#endif
*/
// //
// Reverse byte order of packed elements, vectorized endian conversion. // Reverse byte order of packed elements, vectorized endian conversion.
@@ -249,9 +265,17 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 ) #define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used /* not used
#if defined(__AVX512VBMI2__)
#define mm512_bswap_16( v ) mm512_ror_16( v, 8 )
#else
#define mm512_bswap_16( v ) \ #define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \ _mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) ) 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
#endif
*/ */
#define mm512_bswap_16( v ) \ #define mm512_bswap_16( v ) \
@@ -431,8 +455,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) ); _mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes // 64 bit lanes
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE. // Redundant with ror & rol but included for consistency with AVX2/SSE.
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 ) #define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered #define mm512_swap64_32 mm512_qrev32 // grandfathered
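For clarity, the equivalence referred to above (not part of the commit): with AVX512 the same swap of 32-bit halves within each 64-bit lane can be written either as the shuffle used by mm512_qrev32 or as a 64-bit rotate by 32.

#include <immintrin.h>

#if defined(__AVX512F__)
static inline __m512i qrev32_via_shuffle( const __m512i v )
{
   return _mm512_shuffle_epi32( v, (_MM_PERM_ENUM)0xb1 );  // dwords 1,0,3,2
}

static inline __m512i qrev32_via_rotate( const __m512i v )
{
   return _mm512_ror_epi64( v, 32 );                       // same result
}
#endif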

@@ -126,7 +126,7 @@
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 ) #define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( v1 ^ v0 ), same as (~v1) ^ v0 // ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) ) #define v128_nxor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64 // ~v1 | v0, args reversed for consistency with x86_64
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 ) #define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
@@ -349,52 +349,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) ) vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation // Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \ #define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \ ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \

@@ -474,6 +474,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
} }
/* /*
// ARM feature compiler flags
#ifdef __aarch64__ #ifdef __aarch64__
#warning "__aarch64__" #warning "__aarch64__"
#endif #endif
@@ -509,16 +510,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#endif #endif
*/ */
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
// Flags:
// AVX10 128 256 512
// 0 0 0 0 = AVX10 not supported
// 1 1 1 0 = AVX10 256 bit max (version 2)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
static inline bool cpu_arch_x86_64() static inline bool cpu_arch_x86_64()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
@@ -766,6 +757,17 @@ static inline bool has_vbmi2()
#endif #endif
} }
static inline bool has_amx()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EDX_Reg ] & AMX_TILE_Flag;
#else
return false;
#endif
}
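AMX_TILE_Flag is not defined in this hunk; presumably it corresponds to the AMX-TILE feature bit reported in CPUID leaf 7, sub-leaf 0, EDX, bit 24 in Intel's documentation, for example:

#define AMX_TILE_Flag   (1u << 24)   // CPUID.(EAX=07H,ECX=0):EDX[24], AMX-TILE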
static inline bool has_aes() static inline bool has_aes()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
@@ -815,12 +817,9 @@ static inline bool has_sveaes()
static inline bool has_sha256() static inline bool has_sha256()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info ); cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag; return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false; return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2) #elif defined(__aarch64__) && defined(HWCAP_SHA2)
// NEON SHA256 // NEON SHA256
@@ -835,7 +834,7 @@ static inline bool has_sha256()
static inline bool has_sha512() static inline bool has_sha512()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
if ( has_avx2() ) if ( has_avx() )
{ {
unsigned int cpu_info[4] = { 0 }; unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info ); cpuid( EXTENDED_FEATURES, 1, cpu_info );
@@ -852,7 +851,6 @@ static inline bool has_sha512()
#endif #endif
} }
// Arm only
static inline bool has_sha3() static inline bool has_sha3()
{ {
#if defined(__aarch64__) && defined(HWCAP_SHA3) #if defined(__aarch64__) && defined(HWCAP_SHA3)
@@ -944,16 +942,6 @@ static inline int sve_vector_length()
return 0; return 0;
} }
// Assume min_vlen refers to the register size
static inline int rvv_vector_length()
{
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}
// generic
static inline int vector_length() static inline int vector_length()
{ {
#if defined(__x86_64__) #if defined(__x86_64__)
@@ -965,8 +953,8 @@ static inline int vector_length()
return has_sve() ? sve_vector_length() return has_sve() ? sve_vector_length()
: has_neon() ? 128 : has_neon() ? 128
: 0; : 0;
#elif defined(__riscv) && defined(__riscv_vector) #elif defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return rvv_vector_length(); return __riscv_v_min_vlen;
#endif #endif
return 0; return 0;
} }

util.c

@@ -304,39 +304,28 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
free(cmd); free(cmd);
} }
// Decimal SI, factors of 1000
void scale_hash_for_display ( double* hashrate, char* prefix )
{
if ( *hashrate < 1e4 ) *prefix = 0;
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
else { *prefix = 'Y'; *hashrate /= 1e24; }
}
void format_hashrate( double hashrate, char *output ) void format_hashrate( double hashrate, char *output )
{ {
char prefix = '\0'; char prefix = '\0';
scale_hash_for_display( &hashrate, &prefix );
if (hashrate < 10000) { sprintf( output, prefix ? "%.2f %cH/s" : "%.2f H/s%c", hashrate, prefix );
// nop
}
else if (hashrate < 1e7) {
prefix = 'k';
hashrate *= 1e-3;
}
else if (hashrate < 1e10) {
prefix = 'M';
hashrate *= 1e-6;
}
else if (hashrate < 1e13) {
prefix = 'G';
hashrate *= 1e-9;
}
else {
prefix = 'T';
hashrate *= 1e-12;
} }
sprintf( // Binary SI, factors of 1024
output,
prefix ? "%.2f %cH/s" : "%.2f H/s%c",
hashrate, prefix
);
}
// For use with MiB etc
void format_number_si( double* n, char* si_units ) void format_number_si( double* n, char* si_units )
{ {
if ( *n < 1024*10 ) { *si_units = 0; return; } if ( *n < 1024*10 ) { *si_units = 0; return; }

@@ -55,9 +55,9 @@ the current directory it will be created.
Data file creation can take up to 30 minutes on a spinning hard drive. Data file creation can take up to 30 minutes on a spinning hard drive.
Once created the new data file will be verified and used immediately Once created the new data file will be verified and used immediately
if a valid url and user were included on the command line. if a valid url and user were included on the command line.
A default data file can be created by omitting the url option. That will A default data file can also be created by omitting the url option. That will
either verify an existing default data file or create one and verify it, either verify an existing default data file or create one and verify it,
then exit. then exit.