Compare commits

..

2 Commits
v25.6 ... v26.1

Author SHA1 Message Date
Jay D Dee
b34565bfac v26.1 2026-01-13 19:17:47 -05:00
Jay D Dee
8f2f9ec3e9 v25.7 2025-11-15 10:44:32 -05:00
27 changed files with 569 additions and 488 deletions

View File

@@ -32,8 +32,6 @@ Requirements
32 bit CPUs are not supported.
Older CPUs are supported by open source cpuminer-multi by TPruvot but at reduced performance.
Mining on mobile devices that meet the requirements is not recommended due to the risk of
overheating and damaging the battery. Mining has unlimited demand; it will push any device
to or beyond its limits. There is also a fire risk with overheated lithium batteries.
@@ -75,6 +73,18 @@ If not what makes it happen or not happen?
Change Log
----------
v26.1
Fixed segfault in scrypt algo on some older CPUs.
v25.7
Fixed a bug calculating TTF longer than 1 year.
Faster argon2d.
Faster hamsi AVX512.
Faster swifftx AVX2.
Other small fixes and improvements.
v25.6
Added argon2d1000, argon2d16000 algos.

View File

@@ -66,82 +66,60 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#if defined(__SSSE3__) || defined(__ARM_NEON)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = v128_alignr8(B1, B0, 8); \
v128_t t1 = v128_alignr8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = v128_alignr8(D1, D0, 8); \
t1 = v128_alignr8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = v128_alignr8( B1, B0, 8 ); \
B1 = v128_alignr8( B0, B1, 8 ); \
B0 = t; \
t = v128_alignr8( D1, D0, 8 ); \
D0 = v128_alignr8( D0, D1, 8 ); \
D1 = t; \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = v128_alignr8(B0, B1, 8); \
v128_t t1 = v128_alignr8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = v128_alignr8(D0, D1, 8); \
t1 = v128_alignr8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = v128_alignr8( B0, B1, 8 ); \
B1 = v128_alignr8( B1, B0, 8 ); \
B0 = t; \
t = v128_alignr8( D0, D1, 8 ); \
D0 = v128_alignr8( D1, D0, 8 ); \
D1 = t; \
}
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = D0; \
v128_t t1 = B0; \
D0 = C0; \
C0 = C1; \
C1 = D0; \
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = D0; \
D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
t = B0; \
B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = B0; \
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
t = D0; \
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0, t1; \
t0 = C0; \
C0 = C1; \
C1 = t0; \
t0 = B0; \
t1 = D0; \
B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#endif
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
{ \
G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
}
#else /* __AVX2__ */
#include <immintrin.h>
@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp1 = C0; \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
C1 = tmp1; \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp1 = C0; \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);
@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);
@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#include <immintrin.h>
/*
static inline __m512i muladd(__m512i x, __m512i y)
{
__m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}
*/
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
}
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
}
/*
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0)
*/
/*
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0)
*/
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while ((void)0, 0)
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while ((void)0, 0)
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 };
static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 };
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };
#define SWAP_HALVES(A0, A1) \
do { \
__m512i t; \
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
A0 = t; \
} while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
A0 = t; \
}
#define UNSWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
A0 = t; \
}
/*
#define SWAP_QUARTERS(A0, A1) \
do { \
SWAP_HALVES(A0, A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0)
*/
/*
#define UNSWAP_QUARTERS(A0, A1) \
do { \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
SWAP_HALVES(A0, A1); \
} while((void)0, 0)
*/
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
do { \

View File

@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 );
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
__m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr );
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] )
{
__m512i qt[32], xl, xh;
__m512i mh[16];
__m512i mh[16], mj[16];
int i;
for ( i = 0; i < 16; i++ )
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
__m512i mj[16];
mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 );
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 );
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
__m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr );

View File

@@ -503,32 +503,28 @@ do { \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
L8( s0, t0, s9, t1 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
@@ -537,21 +533,20 @@ do { \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
@@ -1142,7 +1137,7 @@ do { \
} \
} while (0)
// v3 ternary logic, 8 instructions, 2 local vars
// v4 ternary logic, 8 instructions, 2 local vars
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
@@ -1268,7 +1263,7 @@ do { \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
// v3, 15 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
@@ -1286,7 +1281,7 @@ do { \
#endif
/*
/ v2, 16 instructions, 10 TL equivalent instructions
/ v2, 16 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i t = mm256_xorand( d, a, c ); \

View File

@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
__m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \
a1 = mm512_nxor( a1, a0 ); \
a0 = t; \
}
@@ -527,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \
a1 = mm256_nxor( a1, a0 ); \
a0 = t; \
}

View File

@@ -69,18 +69,18 @@
v128_t t = a0; \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = v128_xnor( a1, a0 ); \
a1 = v128_nxor( a1, a0 ); \
a0 = t; \
}
#else
#elif defined(__ARM_NEON) || defined(__SSE2__)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -37,8 +37,8 @@
#if defined(SIMD512)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2
//#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// #define SCRYPT_THROUGHPUT 2
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#elif defined(__SSE2__) || defined(__ARM_NEON)
@@ -162,7 +162,7 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
}
#endif // throughput 1
//
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
@@ -1230,7 +1230,8 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input,
#if ( SCRYPT_THROUGHPUT == 4 )
#if defined(__SHA__)
#if 0
//#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thrid )
@@ -1244,6 +1245,15 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
memcpy( tstate+16, midstate, 32 );
memcpy( tstate+24, midstate, 32 );
HMAC_SHA256_80_init_SHA_2BUF( input, input+20, tstate, tstate+8,
ostate, ostate+8 );
PBKDF2_SHA256_80_128_SHA_2BUF( tstate, tstate+8, ostate, ostate+8,
input, input+20, W, W+32 );
HMAC_SHA256_80_init_SHA_2BUF( input+40, input+60, tstate+16, tstate+24,
ostate+16, ostate+24 );
PBKDF2_SHA256_80_128_SHA_2BUF( tstate+16, tstate+24, ostate+16, ostate+24,
input+40, input+60, W+64, W+96 );
/*
HMAC_SHA256_80_init( input, tstate, ostate );
PBKDF2_SHA256_80_128( tstate, ostate, input, W );
HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 );
@@ -1252,7 +1262,7 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 );
HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 );
PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 );
*/
/*
// Working Linear single threaded SIMD
scrypt_core_simd128( W, V, N );
@@ -1278,11 +1288,16 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+ 8, ostate, ostate+ 8,
W, W+32, output, output+ 8 );
PBKDF2_SHA256_128_32_SHA_2BUF( tstate+16, tstate+24, ostate+16, ostate+24,
W+64, W+96, output+16, output+24 );
/*
PBKDF2_SHA256_128_32( tstate, ostate, W, output );
PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 );
PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 );
PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 );
*/
return 1;
}
@@ -1390,13 +1405,13 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
thr_id );
#elif ( SCRYPT_THROUGHPUT == 4 )
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
thr_id );
#else
// #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
// thr_id );
// #else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#endif
// #endif
#elif ( SCRYPT_THROUGHPUT == 2 ) && ( defined(__SHA__) || defined(__ARM_FEATURE_SHA2) )
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
thr_id );
@@ -1444,11 +1459,6 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | SHA256_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
opt_target_factor = 65536.0;
@@ -1477,8 +1487,9 @@ bool register_scrypt_algo( algo_gate_t* gate )
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
break;
default:
scratchbuf_size = opt_param_n; // 1 way
scratchbuf_size = opt_param_n * 128; // 1 way
}
char t_units[4] = {0};

View File

@@ -67,7 +67,7 @@ static const uint64_t K512[80] =
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#if defined(__AVX2__) && defined(__SHA512__)
#if defined(__AVX__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension.

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__)
#if defined(__SHA512__) && defined(__AVX__)
// Experimental, untested
// Need to substitute for sph_sha512

View File

@@ -305,7 +305,7 @@ do { \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
xb0 = mm512_nxor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_16 do { \
@@ -898,7 +898,7 @@ do { \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
xb0 = mm256_nxor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
{{ -30, 55, -58, -65, -95, -40, -98, 94 }},
};
#if defined(__AVX2__)
static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff,
0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#define V128_00FF _mm256_castsi256_si128( V256_00FF )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#endif
#if defined(SIMD512)
static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V256_0101 _mm512_castsi512_si256( V512_0101 )
#define V128_0101 _mm512_castsi512_si128( V512_0101 )
static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V256_0080 _mm512_castsi512_si256( V512_0080 )
#define V128_0080 _mm512_castsi512_si128( V512_0080 )
#elif defined(__AVX2__)
static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V128_0101 _mm256_castsi256_si128( V256_0101 )
static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V128_0080 _mm256_castsi256_si128( V256_0080 )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 };
static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 };
#endif
#if defined(__x86_64__)
#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
@@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE(x) \
v128_sub16( v128_and( x, v128_64( \
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
v128_sub16( x, v128_and( \
v128_64( 0x0101010101010101 ), \
v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) )
v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -293,10 +337,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
v128u16_t t1= X(i); \
v128u16_t t2= X(j); \
X(i) = v128_unpacklo16( t1, t2 ); \
X(j) = v128_unpackhi16( t1, t2 ); \
v128u16_t t = X(i); \
X(i) = v128_unpacklo16( t, X(j) ); \
X(j) = v128_unpackhi16( t, X(j) ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] =
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#if defined(VL256)
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
_mm256_srai_epi16( x, 8 ) )
#else
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
#endif
_mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
_mm256_set1_epi64x( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
_mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \
_mm256_cmpgt_epi16( x, V256_0080 ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -917,10 +949,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
__m256i t1= X(i); \
__m256i t2= X(j); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
__m256i t = X(i); \
X(i) = _mm256_unpacklo_epi16( t, X(j) ); \
X(j) = _mm256_unpackhi_epi16( t, X(j) ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
_mm512_set1_epi64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
_mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) )
// generic, except it calls targetted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

View File

@@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#if defined(__AVX2__)
__m256i F0, F1, F2, F3, F4, F5, F6, F7;
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
__m256i *table = (__m256i*)fftTable;
__m256i tbl = table[ input[0] ];
__m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output;
F0 = _mm256_mullo_epi32( mul[0], tbl );
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
tbl = table[ input[1] ];
F1 = _mm256_mullo_epi32( mul[1], tbl );
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
tbl = table[ input[2] ];
F2 = _mm256_mullo_epi32( mul[2], tbl );
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
tbl = table[ input[3] ];
F3 = _mm256_mullo_epi32( mul[3], tbl );
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
tbl = table[ input[4] ];
F4 = _mm256_mullo_epi32( mul[4], tbl );
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
tbl = table[ input[5] ];
F5 = _mm256_mullo_epi32( mul[5], tbl );
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
tbl = table[ input[6] ];
F6 = _mm256_mullo_epi32( mul[6], tbl );
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
tbl = table[ input[7] ];
F7 = _mm256_mullo_epi32( mul[7], tbl );
#define ADD_SUB( a, b ) \
@@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F1, F3 );
ADD_SUB( F4, F6 );
ADD_SUB( F5, F7 );
F5 = _mm256_slli_epi32( F5, 2 );
F6 = _mm256_slli_epi32( F6, 4 );
F7 = _mm256_slli_epi32( F7, 6 );
F5 = _mm256_slli_epi32( F5, 2 );
ADD_SUB( F0, F4 );
ADD_SUB( F1, F5 );
ADD_SUB( F2, F6 );

View File

@@ -4,11 +4,11 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
rm cpuminer cpuminer-m2 cpuminer-m4 cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto > /dev/null
# armv9 needs gcc-13
# -march-armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512
# -march=armv9-a+crypto adds AES & SHA256 but not SHA512
make distclean || echo clean
rm -f config.status
@@ -27,18 +27,37 @@ CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-c
make -j $(nproc)
mv cpuminer cpuminer-armv9
# Apple M4: armv9.2, AES, SHA3, SVE2
make clean || echo clean
CFLAGS="-O3 -march=armv9.2-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m4
# Apple M2: armv8.6, AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.6-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m2
# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4
# Apple M1: armv8.4 AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.4-crypto-sha3
# Cortex-A76 (Orange Pi 5)
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8.2-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.2-crypto
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl

View File

@@ -34,9 +34,7 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Graniterapids does not build with AVX10, SHA512 or APX.
# wait for Diamondrapids & gcc-15.
# Intel Core Graniterapids: AVX512, SHA256, VAES, AMX, needs gcc-14
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
@@ -44,13 +42,13 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-graniterapids
# SHA512 AVX10.1
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-avx10_1
# Graniterapids + SHA512, AVX10.1
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx10.1
# SHA512 AVX10.2
#make clean || echo clean
@@ -72,6 +70,9 @@ mv cpuminer cpuminer-graniterapids
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
# zen4 is close enough for older compiler
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen5
@@ -138,13 +139,21 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
# SSE4.2 AES SHA: Intel Atom Goldmont, newer Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=goldmont -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse42-aes-sha
# SSE4.2 AES: Intel Westmere, older Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-aes-sse42
mv cpuminer cpuminer-sse42-aes
# SSE4.2: Intel Nehalem
make clean || echo clean

View File

@@ -2,8 +2,8 @@
#
# make clean and rm all the targetted executables.
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512* cpuminer-alderlake cpuminer-avx10* cpuminer-avx2* cpuminer-avx cpuminer-sse* cpuminer-ssse3 cpuminer-zen* cpuminer-x64 cpuminer-armv* cpuminer-m2 cpuminer-m4 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null
rm cpuminer-avx512* cpuminer-avx2* cpuminer-avx.exe cpuminer-sse* cpuminer-zen* cpuminer-x64.exe > /dev/null
make distclean > /dev/null

28
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.6.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 26.1.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.6'
PACKAGE_STRING='cpuminer-opt 25.6'
PACKAGE_VERSION='26.1'
PACKAGE_STRING='cpuminer-opt 26.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 26.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
short | recursive ) echo "Configuration of cpuminer-opt 26.1:";;
esac
cat <<\_ACEOF
@@ -1536,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.6
cpuminer-opt configure 26.1
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.6, which was
It was created by cpuminer-opt $as_me 26.1, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3591,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.6'
VERSION='26.1'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -5808,11 +5808,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
printf %s "checking for $CXX option to enable C++11 features... " >&6; }
if test ${ac_cv_prog_cxx_cxx11+y}
if test ${ac_cv_prog_cxx_11+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_cxx11=no
ac_cv_prog_cxx_11=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
@@ -5854,11 +5854,11 @@ if test x$ac_prog_cxx_stdcxx = xno
then :
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
printf %s "checking for $CXX option to enable C++98 features... " >&6; }
if test ${ac_cv_prog_cxx_cxx98+y}
if test ${ac_cv_prog_cxx_98+y}
then :
printf %s "(cached) " >&6
else $as_nop
ac_cv_prog_cxx_cxx98=no
ac_cv_prog_cxx_98=no
ac_save_CXX=$CXX
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.6, which was
This file was extended by cpuminer-opt $as_me 26.1, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.6
cpuminer-opt config.status 26.1
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.6])
AC_INIT([cpuminer-opt], [26.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.6.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 26.1.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.6'
PACKAGE_STRING='cpuminer-opt 25.6'
PACKAGE_VERSION='26.1'
PACKAGE_STRING='cpuminer-opt 26.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
'configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems.
'configure' configures cpuminer-opt 26.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1424,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.6:";;
short | recursive ) echo "Configuration of cpuminer-opt 26.1:";;
esac
cat <<\_ACEOF
@@ -1528,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.6
cpuminer-opt configure 26.1
generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.6, which was
It was created by cpuminer-opt $as_me 26.1, which was
generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3768,7 +3768,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.6'
VERSION='26.1'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7581,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.6, which was
This file was extended by cpuminer-opt $as_me 26.1, which was
generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7649,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.6
cpuminer-opt config.status 26.1
configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\"

View File

@@ -921,39 +921,32 @@ out:
return rc;
}
// Scale a raw hash rate into a human-readable magnitude.
// On return *prefix holds the metric unit prefix character ('k', 'M', ...,
// 'Y', or 0 when no scaling applies) and *hashrate has been divided by the
// matching power of 1000.
void scale_hash_for_display ( double* hashrate, char* prefix )
{
   // Upper bound for each prefix, its divisor, and the prefix character.
   static const struct { double limit; double div; char tag; } steps[] =
   {
      { 1e7,  1e3,  'k' }, { 1e10, 1e6,  'M' }, { 1e13, 1e9,  'G' },
      { 1e16, 1e12, 'T' }, { 1e19, 1e15, 'P' }, { 1e22, 1e18, 'E' },
      { 1e25, 1e21, 'Z' }
   };
   if ( *hashrate < 1e4 )
   {
      *prefix = 0;   // small enough to show unscaled
      return;
   }
   for ( int i = 0; i < 7; i++ )
   {
      if ( *hashrate < steps[i].limit )
      {
         *prefix = steps[i].tag;
         *hashrate /= steps[i].div;
         return;
      }
   }
   // Anything at or above 1e25 is reported in yotta-hashes.
   *prefix = 'Y';
   *hashrate /= 1e24;
}
// Does not account for leap years.
static inline void sprintf_et( char *str, long unsigned int seconds )
{
long unsigned int min = seconds / 60;
long unsigned int sec = seconds % 60;
long unsigned int hrs = min / 60;
if ( unlikely( hrs ) )
long unsigned int minutes = seconds / 60;
if ( minutes )
{
long unsigned int hours = minutes / 60;
if ( hours )
{
long unsigned int days = hours / 24;
if ( days )
{
long unsigned int days = hrs / 24;
long unsigned int years = days / 365;
if ( years ) // 0y000d
sprintf( str, "%luy%lud", years, years % 365 );
else if ( days ) // 0d00h
sprintf( str, "%lud%02luh", days, hrs % 24 );
else // 0h00m
sprintf( str, "%luh%02lum", hrs, min % 60 );
if ( years )
sprintf( str, "%luy%03lud", years, days % 365 ); // 0y000d
else
sprintf( str, "%lud%02luh", days, hours % 24 ); // 0d00h
}
else // 0m00s
sprintf( str, "%lum%02lus", min, sec );
else
sprintf( str, "%luh%02lum", hours, minutes % 60 ); // 0h00m
}
else
sprintf( str, "%lum%02lus", minutes, seconds % 60 ); // 0m00s
}
else
sprintf( str, "%lus", seconds ); // 0s
}
const long double exp32 = EXP32; // 2**32
@@ -1239,7 +1232,7 @@ static int share_result( int result, struct work *work,
sprintf( ares, "A%d", accepted_share_count );
sprintf( bres, "B%d", solved_block_count );
if ( reason )
stale = strstr( reason, "job" );
stale = strstr( reason, "job" ) || strstr( reason, "Job" );
else if ( work )
stale = work->data[ algo_gate.ntime_index ]
!= g_work.data[ algo_gate.ntime_index ];
@@ -2833,67 +2826,29 @@ static void show_credits()
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64
bool sw_has_sve2 = false; // AArch64
bool sw_has_sve = false;
bool sw_has_sve2 = false;
bool sw_has_sme = false;
bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_ssse3 = false;
bool sw_has_sse41 = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_amx = false;
bool sw_has_apx = false;
bool sw_has_aes = false; // x86_64 or AArch64
bool sw_has_vaes = false; // x86_64
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_aes;
bool use_vaes;
bool use_sha256;
bool use_sha512;
bool use_neon;
bool use_none;
*/
bool sw_has_sha512 = false;
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2928,14 +2883,15 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true;
#endif
// AVX10 version is not significant as of AVX10.2. If that changes use a better
// way to test the version than sequentially.
// #if defined(__AVX10_2__)
//
// #elif defined(__AVX10_1__)
#if defined(__AVX10_1__)
#if defined(__AVX10_1__) // version is not significant
sw_has_avx10 = true;
#endif
#ifdef __AMX_TILE__
sw_has_amx = true;
#endif
#ifdef __APX_F__
sw_has_apx = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
@@ -2955,6 +2911,7 @@ static bool cpu_capability( bool display_only )
#if defined(__ARM_NEON)
sw_has_neon = true;
#endif
// FYI, SVE & SME not used by cpuminer
#if defined(__ARM_FEATURE_SVE)
sw_has_sve = true;
#endif
@@ -2975,8 +2932,7 @@ static bool cpu_capability( bool display_only )
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
@@ -3002,27 +2958,30 @@ static bool cpu_capability( bool display_only )
printf("CPU features: ");
if ( cpu_arch_x86_64() )
{
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( has_avx10() ) printf( " AVX10.%d", avx10_version() );
else if ( has_avx512() ) printf( " AVX512" );
else if ( has_avx2() ) printf( " AVX2 " );
else if ( has_avx() ) printf( " AVX " );
else if ( has_sse42() ) printf( " SSE4.2" );
else if ( has_sse41() ) printf( " SSE4.1" );
else if ( has_ssse3() ) printf( " SSSE3 " );
else if ( has_sse2() ) printf( " SSE2 " );
if ( has_amx() ) printf( " AMX" );
if ( has_apx_f() ) printf( " APX" );
}
else if ( cpu_arch_aarch64() )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
if ( has_neon() ) printf( " NEON" );
if ( has_sve2() ) printf( " SVE2-%d", sve_vector_length() );
else if ( has_sve() ) printf( " SVE" );
if ( has_sme2() ) printf( " SME2" );
else if ( has_sme() ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( has_vaes() ) printf( " VAES" );
else if ( has_aes() ) printf( " AES" );
if ( has_sha512() ) printf( " SHA512" );
else if ( has_sha256() ) printf( " SHA256" );
printf("\nSW features: ");
if ( sw_has_x86_64 )
@@ -3035,6 +2994,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_amx ) printf( " AMX" );
if ( sw_has_apx ) printf( " APX" );
}
else if ( sw_has_aarch64 )
{

View File

@@ -542,7 +542,9 @@ void applog_hash(void *hash);
void format_hashrate(double hashrate, char *output);
void print_hash_tests(void);
// Factors of 1000 used for hashes, ie kH/s, Mh/s.
void scale_hash_for_display ( double* hashrate, char* units );
// Factors of 1024 used for bytes, ie kiB, MiB.
void format_number_si( double* hashrate, char* si_units );
void report_summary_log( bool force );

View File

@@ -447,7 +447,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#define v128_nxor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -469,7 +469,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#define v128_nxor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif
@@ -642,6 +642,15 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_shuflr32(v) _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v) _mm_shuffle_epi32( v, 0x93 )
/* Zen6 AMD only
// Reverse bits in bytes
#if defined(__AVX512VL__) && defined(__AVX512BMM__)
#define v128_bitrev8 _mm_vbitrevb_epi8
#endif
*/
// Endian byte swap.
#if defined(__SSSE3__)

View File

@@ -170,7 +170,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#define mm256_nxor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -208,7 +208,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
#define mm256_nxor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
#endif
@@ -409,6 +409,15 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
*/
/* Zen6 AMD only
// Reverse bits in bytes
#if defined(__AVX512VL__) && defined(__AVX512BMM__)
#define mm256_bitrev8 _mm256_vbitrevb_epi8
#endif
*/
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) _mm256_shuffle_epi8( v, V256_BSWAP64 )

View File

@@ -18,8 +18,13 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// "_mm512_cmp" instructions now returns a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction.
// "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction. It is also
// slower than AVX2 cmp. There is no version of AVX512 cmp that returns a
// vector result resulting in a double penalty if a vector result is needed:
// slower cmp instruction & extra instruction to convert bit mask into
// vector mask. 256 bit & 128 bit still have legacy cmp returning vector
// while AVX512VL adds masked versions returning bit mask.
//
// Many previously sizeless (si) instructions now have sized (epi) versions
// to accommodate masking packed elements.
@@ -30,7 +35,7 @@
// list.
//
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
// AVX512 permutes can cross all lanes.
// AVX512 instructions using the permute name can cross all lanes.
//
// New alignr instructions for epi64 and epi32 operate across the entire
// vector but slower than epi8 which continues to be restricted to 128 bit
@@ -50,16 +55,17 @@
// parentheses to ensure the expression argument is evaluated first.
// - if an argument is to referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (wasteful) or produces side
// from being evaluated multiple times (wasteful) or produce side
// effects (very bad).
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
// Constants need to be composed at run time by assembling individual
// elements, very expensive. The cost is proportional to the number of
// different elements therefore use the largest element size possible,
// merge smaller integer elements to 64 bits, and group repeated elements.
// elements or loaded from memory, very expensive. The cost of runtime
// construction is proportional to the number of different elements
// therefore use the largest element size possible merging smaller integer
// elements to 64 bits, and group repeated elements.
//
// Constants with repeating patterns can be optimized with the smaller
// patterns repeated more frequently being more efficient.
@@ -67,14 +73,15 @@
// Some specific constants can be very efficient. Zero is very efficient,
// 1 and -1 slightly less so.
//
// If an expensive constant is to be reused in the same function it should
// be declared as a local variable defined once and reused.
// If an expensive constant is to be reused in the same function it may
// be declared as a local variable defined once and reused. If frequently
// used it can be declared as a static const in memory.
//
// Permutations can be very expensive if they use a vector control index,
// even if the permutation itself is quite efficient.
// The index is essentially a constant with all the baggage that brings.
// The same rules apply, if an index is to be reused it should be defined
// as a local. This applies specifically to bswap operations.
// even if the permute instruction itself is quite efficient.
// The index is essentially a vector constant with all the baggage that
// brings. The same rules apply, if an index is to be reused it should either
// be defined as a local or static const.
//
// Permutations that cross 128 bit lanes are typically slower and often need
// a vector control index. If the permutation doesn't need to cross 128 bit
@@ -221,7 +228,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
#define mm512_nxor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
@@ -241,6 +248,24 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_ror_16( v, c ) _mm512_shrdi_epi16( c, v, v )
#define mm512_rol_16( v, c ) _mm512_shldi_epi16( c, v, v )
#endif
*/
/* Zen6 AMD only
// Reverse bits in bytes
#if defined(__AVX512BMM__)
#define mm512_bitrev8 _mm512_vbitrevb_epi8
#endif
*/
//
// Reverse byte order of packed elements, vectorized endian conversion.
@@ -249,9 +274,17 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_bswap_16( v ) mm512_ror_16( v, 8 )
#else
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
#endif
*/
#define mm512_bswap_16( v ) \
@@ -431,8 +464,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
// Redundant with ror & rol but included for consistency with AVX2/SSE.
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered

View File

@@ -126,7 +126,7 @@
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
#define v128_nxor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
@@ -349,52 +349,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
@@ -477,8 +431,10 @@ static inline uint32x4_t v128_shuflr32( uint32x4_t v )
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
// reverse bits in bytes, nothing like it in x86_64
/* not used
// reverse bits in bytes, nothing like it in x86_64 until Zen6
#define v128_bitrev8 vrbitq_u8
*/
// reverse byte order
#define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )

View File

@@ -16,8 +16,8 @@
#include "miner.h"
#include "simd-utils.h"
// Missing on MinGW, MacOS
#if defined(__aarch64__) && !defined(WIN32) && !defined(__APPLE__)
// hwcap.h missing on MinGW, MacOS
#if defined(__aarch64__) && !(defined(WIN32) || defined(__APPLE__))
#define ARM_AUXV
#endif
@@ -191,6 +191,7 @@ static inline int cpu_fanpercent()
#define CPU_INFO (1)
#define CACHE_TLB_DESCRIPTOR (2)
#define EXTENDED_FEATURES (7)
#define EXTENDED_FEATURE_ID (0x21)
#define AVX10_FEATURES (0x24)
#define HIGHEST_EXT_FUNCTION (0x80000000)
#define EXTENDED_CPU_INFO (0x80000001)
@@ -254,8 +255,8 @@ static inline int cpu_fanpercent()
#define AVX512_BF16_Flag (1<< 5)
#define AMX_FP16_Flag (1<<21)
#define AVX_IFMA_Flag (1<<23)
#define MOVRS_Flag (1<<31) // Both names are referenced in docs
#define AVX10_MOVRS_Flag (1<<31)
#define MOVRS_Flag (1<<31) // Both names are referenced in docs
// EDX
#define AVX_VNNI_INT8_Flag (1<< 4)
#define AVX_NE_CONVERT_Flag (1<< 5)
@@ -264,6 +265,10 @@ static inline int cpu_fanpercent()
#define AVX10_Flag (1<<19)
#define APX_F_Flag (1<<21)
// EXTENDED_FEATURE_ID: EAX=0x21, ECX=0
// EAX
#define AVX512_BMM_Flag (1<<23) // Zen6 AMD only
// AVX10_FEATURES: EAX=0x24, ECX=0
// EBX
#define AVX10_VERSION_mask 0xff // bits [7:0]
@@ -474,6 +479,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
}
/*
// ARM feature compiler flags
#ifdef __aarch64__
#warning "__aarch64__"
#endif
@@ -509,16 +515,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#endif
*/
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
// Flags:
// AVX10 128 256 512
// 0 0 0 0 = AVX10 not supported
// 1 1 1 0 = AVX10 256 bit max (version 2)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
static inline bool cpu_arch_x86_64()
{
#if defined(__x86_64__)
@@ -744,7 +740,7 @@ static inline bool has_avx512()
#endif
}
static inline bool has_vbmi()
static inline bool has_avx512vbmi()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
@@ -755,7 +751,7 @@ static inline bool has_vbmi()
#endif
}
static inline bool has_vbmi2()
static inline bool has_avx512vbmi2()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
@@ -766,6 +762,29 @@ static inline bool has_vbmi2()
#endif
}
// Reports whether the CPU advertises AVX512-BMM (Zen6, AMD only).
// Queried via CPUID leaf EXTENDED_FEATURE_ID (0x21), subleaf 0, EAX bit 23.
static inline bool has_avx512bmm()
{
#if defined(__x86_64__)
unsigned int regs[4] = { 0 };
cpuid( EXTENDED_FEATURE_ID, 0, regs );
return ( regs[ EAX_Reg ] & AVX512_BMM_Flag ) != 0;
#else
return false;
#endif
}
// Reports whether the CPU advertises AMX tile support.
// Queried via CPUID leaf EXTENDED_FEATURES (7), subleaf 0, EDX AMX_TILE bit.
static inline bool has_amx()
{
#if defined(__x86_64__)
unsigned int regs[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, regs );
return ( regs[ EDX_Reg ] & AMX_TILE_Flag ) != 0;
#else
return false;
#endif
}
static inline bool has_aes()
{
#if defined(__x86_64__)
@@ -815,13 +834,9 @@ static inline bool has_sveaes()
static inline bool has_sha256()
{
#if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2)
// NEON SHA256
unsigned int cpu_info[4] = { 0 };
@@ -835,7 +850,7 @@ static inline bool has_sha256()
static inline bool has_sha512()
{
#if defined(__x86_64__)
if ( has_avx2() )
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
@@ -852,7 +867,7 @@ static inline bool has_sha512()
#endif
}
// Arm only
// ARM64 only
static inline bool has_sha3()
{
#if defined(__aarch64__) && defined(HWCAP_SHA3)
@@ -944,16 +959,6 @@ static inline int sve_vector_length()
return 0;
}
// Assume min_vlen refers to the register size
// Returns the RISC-V vector register length in bits, or 0 when RVV is
// unavailable or the toolchain does not define __riscv_v_min_vlen.
// NOTE(review): __riscv_v_min_vlen is the compile-time minimum VLEN the
// binary was built for, not necessarily the hardware's actual VLEN.
static inline int rvv_vector_length()
{
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
// Fallback for non-RISC-V targets or builds without the vector extension.
return 0;
}
// generic
static inline int vector_length()
{
#if defined(__x86_64__)
@@ -965,8 +970,8 @@ static inline int vector_length()
return has_sve() ? sve_vector_length()
: has_neon() ? 128
: 0;
#elif defined(__riscv) && defined(__riscv_vector)
return rvv_vector_length();
#elif defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}

49
util.c
View File

@@ -304,39 +304,28 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
free(cmd);
}
void format_hashrate(double hashrate, char *output)
// Decimal SI, factors of 1000
void scale_hash_for_display ( double* hashrate, char* prefix )
{
char prefix = '\0';
if (hashrate < 10000) {
// nop
}
else if (hashrate < 1e7) {
prefix = 'k';
hashrate *= 1e-3;
}
else if (hashrate < 1e10) {
prefix = 'M';
hashrate *= 1e-6;
}
else if (hashrate < 1e13) {
prefix = 'G';
hashrate *= 1e-9;
}
else {
prefix = 'T';
hashrate *= 1e-12;
}
sprintf(
output,
prefix ? "%.2f %cH/s" : "%.2f H/s%c",
hashrate, prefix
);
if ( *hashrate < 1e4 ) *prefix = 0;
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
else { *prefix = 'Y'; *hashrate /= 1e24; }
}
// For use with MiB etc
// Formats a raw hash rate into a human-readable string with a decimal SI
// prefix (k/M/G/T/...), e.g. "12.34 MH/s". Scaling is delegated to
// scale_hash_for_display; output must be large enough for the result.
void format_hashrate( double hashrate, char *output )
{
char prefix = '\0';
scale_hash_for_display( &hashrate, &prefix );
// With no prefix the "%c" still consumes the NUL, matching the scaled form.
if ( prefix )
sprintf( output, "%.2f %cH/s", hashrate, prefix );
else
sprintf( output, "%.2f H/s%c", hashrate, prefix );
}
// Binary SI, factors of 1024
void format_number_si( double* n, char* si_units )
{
if ( *n < 1024*10 ) { *si_units = 0; return; }

View File

@@ -55,9 +55,9 @@ the current directory it will be created.
Data file creation can take up to 30 minutes on a spinning hard drive.
Once created the new data file will be verified and used immediately
if a valid url and user were included on the command line.
if a valid url and user were included on the command line.
A default data file can be created by omitting the url option. That will
A default data file can also be created by omitting the url option. That will
either verify an existing default data file or create one and verify it,
then exit.