Compare commits

..

2 Commits
v25.5 ... v25.7

Author SHA1 Message Date
Jay D Dee
8f2f9ec3e9 v25.7 2025-11-15 10:44:32 -05:00
Jay D Dee
12480a3ea5 v25.6 2025-07-20 19:43:10 -04:00
36 changed files with 960 additions and 935 deletions

View File

@@ -54,9 +54,9 @@ Supported Algorithms
allium Garlicoin
anime Animecoin
argon2 Argon2 coin (AR2)
argon2d250
argon2d500
argon2d1000
argon2d4096
blake Blake-256
blake2b Blake2-512

View File

@@ -75,9 +75,22 @@ If not what makes it happen or not happen?
Change Log
----------
v25.7
Fixed a bug calculating TTF longer than 1 year.
Faster argon2d.
Faster hamsi AVX512.
Faster swifftx AVX2.
Other small fixes and improvements.
v25.6
Added argon2d1000, argon2d16000 algos.
Target specific AES optimizations improve shavite for ARM64 & x86_64.
v25.5
x86_64: Fixed and insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512.
x86_64: Faster sha256d, sha256dt for AVX2 & AVX512.
Other small bug fixes.

View File

@@ -297,6 +297,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_ANIME: rc = register_anime_algo ( gate ); break;
case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break;
case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break;
case ALGO_ARGON2D1000: rc = register_argon2d1000_algo ( gate ); break;
case ALGO_ARGON2D16000: rc = register_argon2d16000_algo ( gate ); break;
case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break;
case ALGO_BLAKE: rc = register_blake_algo ( gate ); break;

View File

@@ -172,8 +172,11 @@ void ( *set_work_data_endian ) ( struct work* );
json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* );
// Deprecated
set_t optimizations;
int ( *get_work_data_size ) ();
int ntime_index;
int nbits_index;
int nonce_index; // use with caution, see warning below
@@ -274,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// OpenSSL sha256 deprecated
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
bool std_le_work_decode( struct work *work );
bool std_be_work_decode( struct work *work );

View File

@@ -6,6 +6,38 @@ static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Inpu
static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
// generic, works with most variations of argon2d
// Generic scanhash loop shared by the argon2d variants.  Byte-swaps the
// 80-byte block header once, then hashes sequential nonces through the
// algo_gate hash hook until the nonce range is exhausted or the work is
// restarted.  Valid (non-benchmark) solutions are submitted with the
// nonce restored to wire byte order.
// Returns 0; *hashes_done receives the number of nonces tried.
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(64) endiandata[20];   // byte-swapped header copy
   uint32_t _ALIGN(64) hash32[8];        // 256-bit candidate hash
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const int thr_id = mythr->id;
   const uint32_t start_n = (const uint32_t) pdata[19];
   const uint32_t end_n   = (const uint32_t) max_nonce;
   const bool bench = opt_benchmark;
   uint32_t n = start_n;

   // One-time endian conversion of the 80-byte header.
   v128_bswap32_80( endiandata, pdata );

   do
   {
      endiandata[19] = n;
      algo_gate.hash( hash32, endiandata, thr_id );
      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
         // Restore wire byte order for the submitted nonce.
         pdata[19] = bswap_32( n );
         submit_solution( work, hash32, mythr );
      }
      n++;
   } while ( likely( n < end_n && !work_restart[thr_id].restart ) );

   pdata[19] = n;
   *hashes_done = n - start_n;
   return 0;
}
void argon2d250_hash( void *output, const void *input )
{
argon2_context context;
@@ -32,41 +64,10 @@ void argon2d250_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
// Legacy per-variant scanhash for argon2d250 (superseded by the shared
// scanhash_argon2d loop).  Iterates nonces, hashing each header with
// argon2d250_hash, and submits results that pass the cheap word-7 check
// plus the full target test.
// Returns 0; *hashes_done receives the count of nonces processed.
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
// Cheap pre-filter: compare only the high target word before fulltest.
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
// Byte-swap the 20-word header once up front.
swab32_array( edata, pdata, 20 );
do {
be32enc(&edata[19], nonce);
argon2d250_hash( hash, edata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
// NOTE(review): the +1 here differs from the newer shared loop, which
// reports nonce - first_nonce without the extra increment.
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
// Register the argon2d250 (Credits) algorithm with the gate.
// Uses the generic scanhash_argon2d loop driving argon2d250_hash.
// Always returns true.
bool register_argon2d250_algo( algo_gate_t* gate )
{
  // Removed a dead store: gate->scanhash was assigned twice in a row,
  // with the first value immediately overwritten.
  gate->scanhash = (void*)&scanhash_argon2d;
  gate->hash = (void*)&argon2d250_hash;
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}
@@ -97,43 +98,78 @@ void argon2d500_hash( void *output, const void *input )
argon2_ctx( &context, Argon2_d );
}
// Per-variant scanhash for argon2d500: identical structure to the shared
// scanhash_argon2d loop but calls argon2d500_hash directly.
// Returns 0; *hashes_done receives the number of nonces tried.
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(64) edata[20];
   uint32_t _ALIGN(64) hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const int thr_id = mythr->id;
   const uint32_t first_nonce = (const uint32_t)pdata[19];
   const uint32_t last_nonce = (const uint32_t)max_nonce;
   uint32_t nonce = first_nonce;
   const bool bench = opt_benchmark;

   // One-time endian conversion of the 80-byte header.
   v128_bswap32_80( edata, pdata );

   do
   {
      edata[19] = nonce;
      argon2d500_hash( hash, edata );
      if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
           && !bench ) )
      {
         // Fixed: stray double semicolon removed from this statement.
         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash, mythr );
      }
      nonce++;
   } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );

   pdata[19] = nonce;
   *hashes_done = pdata[19] - first_nonce;
   return 0;
}
// Register the argon2d500 (Dynamic) algorithm with the gate.
// Uses the generic scanhash_argon2d loop driving argon2d500_hash.
// Always returns true.
bool register_argon2d500_algo( algo_gate_t* gate )
{
  // Removed a dead store: gate->scanhash was assigned twice in a row,
  // with the first value immediately overwritten.
  gate->scanhash = (void*)&scanhash_argon2d;
  gate->hash = (void*)&argon2d500_hash;
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}
// Argon2d hash for the argon2d1000 variant (Zero Dynamics Cash):
// m_cost = 1000 KiB, 8 lanes, 2 iterations, Argon2 version 0x10.
// The 80-byte block header serves as both password and salt; the
// result is a 32-byte hash written to output.
void argon2d1000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 1000; // Memory in KiB (~1 MB)
context.lanes = 8; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 2; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
// Register the argon2d1000 (Zero Dynamics Cash) algorithm with the gate.
// Uses the generic scanhash_argon2d loop driving argon2d1000_hash.
// Always returns true.
bool register_argon2d1000_algo( algo_gate_t* gate )
{
  gate->scanhash = (void*)&scanhash_argon2d;
  gate->hash = (void*)&argon2d1000_hash;
  // Consistency with the sibling argon2d250/argon2d500 registrations,
  // which advertise the same SIMD optimizations for the shared argon2d
  // core.  NOTE(review): confirm this flag set is intended here.
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}
// Argon2d hash for the argon2d16000 variant:
// m_cost = 16000 KiB, 1 lane, 1 iteration, Argon2 version 0x10.
// The 80-byte block header serves as both password and salt; the
// result is a 32-byte hash written to output.
void argon2d16000_hash( void *output, const void *input )
{
argon2_context context;
context.out = (uint8_t *)output;
context.outlen = (uint32_t)OUTPUT_BYTES;
context.pwd = (uint8_t *)input;
context.pwdlen = (uint32_t)INPUT_BYTES;
context.salt = (uint8_t *)input; //salt = input
context.saltlen = (uint32_t)INPUT_BYTES;
context.secret = NULL;
context.secretlen = 0;
context.ad = NULL;
context.adlen = 0;
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
// main configurable Argon2 hash parameters
context.m_cost = 16000; // Memory in KiB (~15.6 MiB)
context.lanes = 1; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 1; // Iterations
context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
// Register the argon2d16000 algorithm with the gate.
// Uses the generic scanhash_argon2d loop driving argon2d16000_hash.
// Always returns true.
bool register_argon2d16000_algo( algo_gate_t* gate )
{
  gate->scanhash = (void*)&scanhash_argon2d;
  gate->hash = (void*)&argon2d16000_hash;
  // Consistency with the sibling argon2d250/argon2d500 registrations,
  // which advertise the same SIMD optimizations for the shared argon2d
  // core.  NOTE(review): confirm this flag set is intended here.
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}
@@ -148,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id; // thr_id arg is deprecated
const int thr_id = mythr->id;
uint32_t t_cost = 1; // 1 iteration
uint32_t m_cost = 4096; // use 4MB
uint32_t parallelism = 1; // 1 thread, 2 lanes
@@ -176,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
// Register the argon2d4096 (Unitus) algorithm with the gate.  This
// variant keeps its own dedicated scanhash implementation instead of
// the shared argon2d loop.  Always returns true.
bool register_argon2d4096_algo( algo_gate_t* gate )
{
  gate->scanhash = (void*)&scanhash_argon2d4096;
  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
  opt_target_factor = 65536.0;
  return true;
}

View File

@@ -4,22 +4,27 @@
#include "algo-gate-api.h"
#include <stdint.h>
int scanhash_argon2d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Credits: version = 0x10, m_cost = 250.
bool register_argon2d250_algo( algo_gate_t* gate );
void argon2d250_hash( void *state, const void *input );
int scanhash_argon2d250( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d500_algo( algo_gate_t* gate );
void argon2d500_hash( void *state, const void *input );
int scanhash_argon2d500( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
// Zero Dynamics Cash: version = 0x10, m_cost = 1000.
bool register_argon2d1000_algo( algo_gate_t* gate );
void argon2d1000_hash( void *state, const void *input );
bool register_argon2d16000_algo( algo_gate_t* gate );
void argon2d16000_hash( void *state, const void *input );
// Unitus: version = 0x13, m_cost = 4096.
bool register_argon2d4096_algo( algo_gate_t* gate );

View File

@@ -66,82 +66,60 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#if defined(__SSSE3__) || defined(__ARM_NEON)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = v128_alignr8(B1, B0, 8); \
v128_t t1 = v128_alignr8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = v128_alignr8(D1, D0, 8); \
t1 = v128_alignr8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = v128_alignr8( B1, B0, 8 ); \
B1 = v128_alignr8( B0, B1, 8 ); \
B0 = t; \
t = v128_alignr8( D1, D0, 8 ); \
D0 = v128_alignr8( D0, D1, 8 ); \
D1 = t; \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = v128_alignr8(B0, B1, 8); \
v128_t t1 = v128_alignr8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
t0 = C0; \
C0 = C1; \
C1 = t0; \
\
t0 = v128_alignr8(D0, D1, 8); \
t1 = v128_alignr8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = v128_alignr8( B0, B1, 8 ); \
B1 = v128_alignr8( B1, B0, 8 ); \
B0 = t; \
t = v128_alignr8( D0, D1, 8 ); \
D0 = v128_alignr8( D1, D0, 8 ); \
D1 = t; \
}
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0 = D0; \
v128_t t1 = B0; \
D0 = C0; \
C0 = C1; \
C1 = D0; \
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#define DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = D0; \
D0 = v128_unpackhi64( D1, v128_unpacklo64( D0, D0 ) ); \
D1 = v128_unpackhi64( t, v128_unpacklo64( D1, D1 ) ); \
t = B0; \
B0 = v128_unpackhi64( B0, v128_unpacklo64( B1, B1 ) ); \
B1 = v128_unpackhi64( B1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
v128_t t = B0; \
B0 = v128_unpackhi64( B1, v128_unpacklo64( B0, B0 ) ); \
B1 = v128_unpackhi64( t, v128_unpacklo64( B1, B1 ) ); \
t = D0; \
D0 = v128_unpackhi64( D0, v128_unpacklo64( D1, D1 ) ); \
D1 = v128_unpackhi64( D1, v128_unpacklo64( t, t ) ); \
}
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
v128_t t0, t1; \
t0 = C0; \
C0 = C1; \
C1 = t0; \
t0 = B0; \
t1 = D0; \
B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#endif
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#define BLAKE2_ROUND( A0, A1, B0, B1, C0, C1, D0, D1 ) \
{ \
G1( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G2( A0, B0, C0, D0, A1, B1, C1, D1 ); \
DIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
G1( A0, B0, C1, D0, A1, B1, C0, D1 ); \
G2( A0, B0, C1, D0, A1, B1, C0, D1 ); \
UNDIAGONALIZE( A0, B0, C0, D0, A1, B1, C1, D1 ); \
}
#else /* __AVX2__ */
#include <immintrin.h>
@@ -211,7 +189,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
@@ -219,17 +196,14 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp1 = C0; \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
C1 = tmp1; \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0x33); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0xCC); \
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -237,7 +211,6 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
@@ -247,27 +220,21 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
tmp1 = C0; \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
C0 = C1; \
B0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
B1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
C1 = tmp1; \
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_shuffle_epi32( tmp2, 0x4e ); \
D0 = _mm256_shuffle_epi32( tmp1, 0x4e ); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);
@@ -275,12 +242,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C1, C0, D0, D1) \
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);
@@ -290,12 +254,73 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
#include <immintrin.h>
/*
static inline __m512i muladd(__m512i x, __m512i y)
{
__m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}
*/
/* First half of the BlaMka G function for two 512-bit lane pairs.
 * Each a = a + b + 2 * lo32(a) * lo32(b) step is expanded inline
 * (mul_epu32 + two adds) instead of a muladd helper so the two lane
 * pairs' operations interleave.  Rotations: D by 32, B by 24. */
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 32); \
D1 = _mm512_ror_epi64(D1, 32); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
}
/* Second half of the BlaMka G function for two 512-bit lane pairs.
 * Same structure as G1 but with the second-half rotation amounts:
 * D by 16, B by 63. */
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
__m512i z0, z1; \
z0 = _mm512_mul_epu32( A0, B0 ); \
z1 = _mm512_mul_epu32( A1, B1 ); \
A0 = _mm512_add_epi64( A0, B0 ); \
A1 = _mm512_add_epi64( A1, B1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
A0 = _mm512_add_epi64( A0, z0 ); \
A1 = _mm512_add_epi64( A1, z1 ); \
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
D0 = _mm512_ror_epi64(D0, 16); \
D1 = _mm512_ror_epi64(D1, 16); \
z0 = _mm512_mul_epu32( C0, D0 ); \
z1 = _mm512_mul_epu32( C1, D1 ); \
C0 = _mm512_add_epi64( C0, D0 ); \
C1 = _mm512_add_epi64( C1, D1 ); \
z0 = _mm512_add_epi64( z0, z0 ); \
z1 = _mm512_add_epi64( z1, z1 ); \
C0 = _mm512_add_epi64( C0, z0 ); \
C1 = _mm512_add_epi64( C1, z1 ); \
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
}
/*
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
@@ -316,7 +341,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 24); \
B1 = _mm512_ror_epi64(B1, 24); \
} while ((void)0, 0)
*/
/*
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
@@ -337,15 +363,14 @@ static inline __m512i muladd(__m512i x, __m512i y)
B0 = _mm512_ror_epi64(B0, 63); \
B1 = _mm512_ror_epi64(B1, 63); \
} while ((void)0, 0)
*/
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while ((void)0, 0)
@@ -354,10 +379,8 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while ((void)0, 0)
@@ -366,15 +389,17 @@ static inline __m512i muladd(__m512i x, __m512i y)
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
static const __m512i swap_q0 = { 0,1, 8,9, 2,3, 10,11 };
static const __m512i swap_q1 = { 4,5, 12,13, 6,7, 14,15 };
static const __m512i uswap_q0 = { 0,1, 4,5, 8,9, 12,13 };
static const __m512i uswap_q1 = { 2,3, 6,7, 10,11, 14,15 };
#define SWAP_HALVES(A0, A1) \
do { \
__m512i t; \
@@ -383,19 +408,36 @@ static inline __m512i muladd(__m512i x, __m512i y)
A0 = t; \
} while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, swap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, swap_q1, A1 ); \
A0 = t; \
}
#define UNSWAP_QUARTERS(A0, A1) \
{ \
__m512i t = _mm512_permutex2var_epi64( A0, uswap_q0, A1 ); \
A1 = _mm512_permutex2var_epi64( A0, uswap_q1, A1 ); \
A0 = t; \
}
/*
#define SWAP_QUARTERS(A0, A1) \
do { \
SWAP_HALVES(A0, A1); \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
} while((void)0, 0)
*/
/*
#define UNSWAP_QUARTERS(A0, A1) \
do { \
A0 = _mm512_shuffle_i64x2( A0, A0, 0xd8 ); \
A1 = _mm512_shuffle_i64x2( A1, A1, 0xd8 ); \
SWAP_HALVES(A0, A1); \
} while((void)0, 0)
*/
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
do { \

View File

@@ -683,8 +683,9 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
mj[14] = mm256_rol_64( M[14], 15 );
mj[15] = mm256_rol_64( M[15], 16 );
__m256i K = _mm256_set1_epi64x( 16 * 0x0555555555555555ULL );
const __m256i Kincr = _mm256_set1_epi64x( 0x0555555555555555ULL );
__m256i K = _mm256_set1_epi64x( 0x5555555555555550ULL );
static const __m256i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm256_add_epi64( K, Kincr );
@@ -1094,7 +1095,7 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
__m512i dH[16] )
{
__m512i qt[32], xl, xh;
__m512i mh[16];
__m512i mh[16], mj[16];
int i;
for ( i = 0; i < 16; i++ )
@@ -1117,8 +1118,6 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
qt[14] = _mm512_add_epi64( s8b4( W8b14), H[15] );
qt[15] = _mm512_add_epi64( s8b0( W8b15), H[ 0] );
__m512i mj[16];
mj[ 0] = mm512_rol_64( M[ 0], 1 );
mj[ 1] = mm512_rol_64( M[ 1], 2 );
mj[ 2] = mm512_rol_64( M[ 2], 3 );
@@ -1136,8 +1135,11 @@ void compress_big_8way( const __m512i *M, const __m512i H[16],
mj[14] = mm512_rol_64( M[14], 15 );
mj[15] = mm512_rol_64( M[15], 16 );
__m512i K = _mm512_set1_epi64( 16 * 0x0555555555555555ULL );
const __m512i Kincr = _mm512_set1_epi64( 0x0555555555555555ULL );
__m512i K = _mm512_set1_epi64( 0x5555555555555550ULL );
static const __m512i Kincr = { 0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL,
0x0555555555555555ULL, 0x0555555555555555ULL };
qt[16] = add_elt_b8( mj[ 0], mj[ 3], mj[10], H[ 7], K );
K = _mm512_add_epi64( K, Kincr );

View File

@@ -503,32 +503,28 @@ do { \
SBOX8( s2, s6, sA, sE ); /* ( m1, c3, m5, c7 ) */ \
SBOX8( s3, s7, sB, sF ); /* ( c1, m3, c5, m7 ) */ \
s4 = mm512_swap64_32( s4 ); \
s5 = mm512_swap64_32( s5 ); \
t0 = _mm512_mask_shuffle_epi32( s4, 0xaaaa, s5, 0xb1 ); \
sD = mm512_swap64_32( sD ); \
sE = mm512_swap64_32( sE ); \
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
t1 = _mm512_mask_shuffle_epi32( sD, 0xaaaa, sE, 0xb1 ); \
L8( s0, t0, s9, t1 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
t2 = _mm512_mask_shuffle_epi32( s6, 0x5555, s5, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sF, 0x5555, sE, 0xb1 ); \
L8( s1, t2, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
t4 = _mm512_mask_shuffle_epi32( s6, 0xaaaa, s7, 0xb1 ); \
t5 = _mm512_mask_shuffle_epi32( sF, 0xaaaa, sC, 0xb1 ); \
L8( s2, t4, sB, t5 ); \
s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
s6 = mm512_swap64_32( s6 ); \
sF = mm512_swap64_32( sF ); \
\
t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
t2 = _mm512_mask_shuffle_epi32( s4, 0x5555, s7, 0xb1 ); \
t3 = _mm512_mask_shuffle_epi32( sD, 0x5555, sC, 0xb1 ); \
L8( s3, t2, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
@@ -537,21 +533,20 @@ do { \
s7 = mm512_swap64_32( s7 ); \
sC = mm512_swap64_32( sC ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
t0 = _mm512_mask_shuffle_epi32( s0, 0xaaaa, s8, 0xb1 ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
t2 = _mm512_mask_shuffle_epi32( sA, 0x5555, s2, 0xb1 ); \
t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
t3 = mm512_swap64_32( t3 ); \
L8( t0, t1, t2, t3 ); \
t3 = mm512_swap64_32( t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
s8 = _mm512_mask_shuffle_epi32( s8, 0x5555, t0, 0xb1 ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
s2 = _mm512_mask_shuffle_epi32( s2, 0xaaaa, t2, 0xb1 ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
s3 = _mm512_mask_shuffle_epi32( s3, 0xaaaa, t3, 0xb1 ); \
sB = _mm512_mask_shuffle_epi32( sB, 0x5555, t3, 0xb1 ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
@@ -1268,7 +1263,7 @@ do { \
} while (0)
#endif
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
// v3, 15 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i tb, td; \
@@ -1286,7 +1281,7 @@ do { \
#endif
/*
/ v2, 16 instructions, 10 TL equivalent instructions
/ v2, 16 instructions
#define SBOX( a, b, c, d ) \
{ \
__m256i t = mm256_xorand( d, a, c ); \

View File

@@ -80,14 +80,14 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
__m512i t = a0; \
a0 = mm512_xoror( a3, a0, a1 ); \
a2 = _mm512_xor_si512( a2, a3 ); \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm512_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm512_xorand( a2, a3, t ); \
a2 = mm512_xorand( a1, a2, a0); \
a1 = _mm512_or_si512( a1, a3 ); \
a3 = _mm512_xor_si512( a3, a2 ); \
t = _mm512_xor_si512( t, a1 ); \
a2 = _mm512_and_si512( a2, a1 ); \
a1 = mm512_xnor( a1, a0 ); \
a1 = mm512_nxor( a1, a0 ); \
a0 = t; \
}
@@ -527,14 +527,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
__m256i t = a0; \
a0 = mm256_xoror( a3, a0, a1 ); \
a2 = _mm256_xor_si256( a2, a3 ); \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 nxor (a3 & t) */ \
a3 = mm256_xorand( a2, a3, t ); \
a2 = mm256_xorand( a1, a2, a0); \
a1 = _mm256_or_si256( a1, a3 ); \
a3 = _mm256_xor_si256( a3, a2 ); \
t = _mm256_xor_si256( t, a1 ); \
a2 = _mm256_and_si256( a2, a1 ); \
a1 = mm256_xnor( a1, a0 ); \
a1 = mm256_nxor( a1, a0 ); \
a0 = t; \
}

View File

@@ -69,18 +69,18 @@
v128_t t = a0; \
a0 = v128_xoror( a3, a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* ~a1 ^ (a3 & t) */ \
a3 = v128_xorand( a2, a3, t ); \
a2 = v128_xorand( a1, a2, a0 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = v128_xnor( a1, a0 ); \
a1 = v128_nxor( a1, a0 ); \
a0 = t; \
}
#else
#elif defined(__ARM_NEON) || defined(__SSE2__)
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \

View File

@@ -67,7 +67,7 @@ static const uint64_t K512[80] =
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#if defined(__AVX2__) && defined(__SHA512__)
#if defined(__AVX__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension.

View File

@@ -5,7 +5,7 @@
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__)
#if defined(__SHA512__) && defined(__AVX__)
// Experimental, untested
// Need to substitute for sph_sha512

View File

@@ -305,7 +305,7 @@ do { \
xb0 = mm512_rol_32( xb0, 1 ); \
xa0 = mm512_xor3( xm, xb1, \
mm512_xorandnot( v512_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm512_xnor( xa0, xb0 ); \
xb0 = mm512_nxor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_16 do { \
@@ -898,7 +898,7 @@ do { \
xb0 = mm256_rol_32( xb0, 1 ); \
xa0 = mm256_xor3( xm, xb1, \
mm256_xorandnot( v256_mult_x3( xa0 ), xb3, xb2 ) ); \
xb0 = mm256_xnor( xa0, xb0 ); \
xb0 = mm256_nxor( xa0, xb0 ); \
} while (0)
#define PERM_STEP_0_8 do { \

View File

@@ -109,7 +109,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_shuflr128_32(
mm256_aesenc_2x128( k00, zero ) ) );

View File

@@ -21,7 +21,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
__m512i *H = (__m512i*)ctx->h;
const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2,
ctx->count1, ctx->count0 );
int r;
const __m512i zero = _mm512_setzero_si512();
P0 = H[0];
P1 = H[1];
@@ -37,182 +37,160 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
K6 = M[6];
K7 = M[7];
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
// round 0
P0 = _mm512_xor_si512( P0, X );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
P2 = _mm512_xor_si512( P2, X );
// round
for ( r = 0; r < 3; r ++ )
for ( int r = 0; r < 3; r ++ )
{
// round 1, 5, 9
// round 1, 5, 9
K0 = _mm512_xor_si512( K7, mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ) );
_mm512_aesenc_epi128( K0, zero ) ) );
if ( r == 0 )
K0 = _mm512_xor_si512( K0,
_mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) );
_mm512_mask_ternarylogic_epi32( count, 0x8888, count, count, 1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( K0,
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K1, zero ) ) );
if ( r == 1 )
K1 = _mm512_xor_si512( K1, mm512_shuflr128_32(
_mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) );
_mm512_mask_ternarylogic_epi32( count, 0x1111, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K1,
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K2, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K2,
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
mm512_shuflr128_32( _mm512_aesenc_epi128( K3, zero ) ) );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
K4 = _mm512_xor_si512( K3,
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K4, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( K4,
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K5, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K5,
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K6,
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
mm512_shuflr128_32( _mm512_aesenc_epi128( K7, zero ) ) );
if ( r == 2 )
K7 = _mm512_xor_si512( K7, mm512_swap128_64(
_mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) );
_mm512_mask_ternarylogic_epi32( count, 0x2222, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
// round 2, 6, 10
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P2 );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P0 );
// round 3, 7, 11
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
_mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
_mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
_mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
_mm512_aesenc_epi128( K3, zero ) ), K2 );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P1 );
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
_mm512_aesenc_epi128( K4, zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), zero );
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
_mm512_aesenc_epi128( K5, zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
_mm512_aesenc_epi128( K6, zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
_mm512_aesenc_epi128( K7, zero ) ), K6 );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P3 );
// round 4, 8, 12
K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero );
K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 );
K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero );
K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 );
}
// round 13
K0 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
_mm512_aesenc_epi128( K0, zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero );
K1 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
_mm512_aesenc_epi128( K1, zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero );
K2 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
_mm512_aesenc_epi128( K2, zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero );
K3 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
_mm512_aesenc_epi128( K3, zero ) ), K2 );
P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 );
K4 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
_mm512_aesenc_epi128( K4, zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero );
K5 = _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, mm512_swap64_32(
_mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
_mm512_aesenc_epi128( K5, zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero );
K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) );
K6 = mm512_xor3( K6, K5, mm512_swap64_32(
_mm512_mask_ternarylogic_epi32( count, 0x4444, count, count, 1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero );
K7= _mm512_xor_si512( mm512_shuflr128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
_mm512_aesenc_epi128( K7, zero ) ), K6 );
P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 );
H[0] = _mm512_xor_si512( H[0], P2 );
H[1] = _mm512_xor_si512( H[1], P3 );

View File

@@ -1,159 +0,0 @@
#include "miner.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "sph_shavite.h"
extern void inkhash(void *state, const void *input)
{
    // Shavite coin (INK) proof-of-work hash: two chained SHAvite-512
    // passes over the 80-byte block header, truncated to 256 bits.
    //
    // state : output buffer, receives the first 32 bytes of the digest.
    // input : 80-byte big-endian block header.
    sph_shavite512_context ctx_shavite;
    uint32_t hash[16];

    // First pass: SHAvite-512 over the 80-byte header.
    sph_shavite512_init(&ctx_shavite);
    sph_shavite512(&ctx_shavite, (const void*) input, 80);
    sph_shavite512_close(&ctx_shavite, (void*) hash);

    // Second pass: SHAvite-512 over the 64-byte digest of the first pass.
    sph_shavite512_init(&ctx_shavite);
    sph_shavite512(&ctx_shavite, (const void*) hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*) hash);

    // Only the low 256 bits of the final digest are the PoW hash.
    memcpy(state, hash, 32);
}
int scanhash_ink( struct work *work,
   uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
   // Scan nonces [first_nonce, max_nonce] for a hash meeting the target.
   // Returns 1 (solution found, nonce stored in pdata[19]) or 0.
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   int thr_id = mythr->id;
   uint32_t n = pdata[19] - 1;
   const uint32_t first_nonce = pdata[19];
   uint32_t _ALIGN(32) hash64[8];
   uint32_t endiandata[32];
   uint32_t mask;

   // The hash needs big-endian data. Do NOT byte-swap pdata in place:
   // that causes every submitted share to be rejected as stale.
   for ( int kk = 0; kk < 32; kk++ )
      be32enc( &endiandata[kk], pdata[kk] );

   // Precompute an early-rejection mask from the high target word so the
   // common case is a single AND instead of a full target comparison.
   // This replaces six copy-pasted scan loops that differed only in this
   // constant; mask == 0 makes the pre-test always pass, matching the
   // original unconditional fulltest() branch.
   if      ( ptarget[7] == 0 )      mask = 0xFFFFFFFF;
   else if ( ptarget[7] <= 0xF )    mask = 0xFFFFFFF0;
   else if ( ptarget[7] <= 0xFF )   mask = 0xFFFFFF00;
   else if ( ptarget[7] <= 0xFFF )  mask = 0xFFFFF000;
   else if ( ptarget[7] <= 0xFFFF ) mask = 0xFFFF0000;
   else                             mask = 0;

   do
   {
      pdata[19] = ++n;
      be32enc( &endiandata[19], n );
      inkhash( hash64, endiandata );
      if ( ( ( hash64[7] & mask ) == 0 ) && fulltest( hash64, ptarget ) )
      {
         *hashes_done = n - first_nonce + 1;
         return 1;   // was "return true" from an int function
      }
   } while ( n < max_nonce && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
   return 0;
}
// Gate registration stub for the shavite algo. The algo is currently
// disabled: it reports "not implemented" and returns false so the miner
// refuses to start with it. The commented-out lines show the gate hooks
// that would enable it (scanhash_ink / inkhash above).
bool register_shavite_algo( algo_gate_t* gate )
{
// Always reports failure; gate hooks are intentionally left unset.
algo_not_implemented();
return false;
// gate->scanhash = (void*)&scanhash_ink;
// gate->hash = (void*)&inkhash;
// return true;
};

View File

@@ -50,7 +50,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
static const sph_u32 IV512[] = {
static const sph_u32 IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
@@ -71,38 +72,26 @@ c512( sph_shavite_big_context *sc, const void *msg )
p2 = h[2];
p3 = h[3];
// round
k00 = m[0];
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = m[1];
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = m[2];
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = m[3];
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p0 = v128_xor( p0, x );
k10 = m[4];
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = m[5];
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = m[6];
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = m[7];
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p2 = v128_xor( p2, x );
// round 0
x = v128_xoraesenc( p1, k00 );
x = v128_xoraesenc( x, k01 );
x = v128_xoraesenc( x, k02 );
p0 = v128_xoraesencxor( x, k03, p0 );
x = v128_xoraesenc( p3, k10 );
x = v128_xoraesenc( x, k11 );
x = v128_xoraesenc( x, k12 );
p2 = v128_xoraesencxor( x, k13, p2 );
for ( r = 0; r < 3; r ++ )
{
@@ -113,198 +102,165 @@ c512( sph_shavite_big_context *sc, const void *msg )
if ( r == 0 )
k00 = v128_xor( k00, v128_set32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = v128_xoraesenc( p0, k00 );
x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
if ( r == 1 )
k01 = v128_xor( k01, v128_set32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = v128_xoraesenc( x, k01 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k03, p3 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xoraesenc( p2, k10 );
x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
if ( r == 2 )
k13 = v128_xor( k13, v128_set32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k13, p1 );
// round 2, 6, 10
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p3, k00 );
p2 = v128_xor( p2, x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p2 = v128_xoraesencxor( x, k03, p2 );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p1, k10 );
p0 = v128_xor( p0, x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p0 = v128_xoraesencxor( x, k13, p0 );
// round 3, 7, 11
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p2, k00 );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k01 );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k03, p1 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p0, k10 );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k13, p3 );
// round 4, 8, 12
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 );
x = v128_aesenc_nokey( x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p1, k00 );
p0 = v128_xor( p0, x );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xoraesenc( x, k01 );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xoraesenc( x, k02 );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
p0 = v128_xoraesencxor( x, k03, p0 );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 );
x = v128_aesenc_nokey( x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p3, k10 );
p2 = v128_xor( p2, x );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xoraesenc( x, k11 );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xoraesenc( x, k12 );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
p2 = v128_xoraesencxor( x, k13, p2 );
}
// round 13
k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p0, k00 );
k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k01 );
k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k02 );
k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc_nokey( x );
p3 = v128_xor( p3, x );
p3 = v128_xoraesencxor( x, k03, p3 );
k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( p2, k10 );
k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k11 );
k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = v128_xor( x, k12 );
x = v128_aesenc_nokey( x );
x = v128_xoraesenc( x, k12 );
k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc_nokey( x );
p1 = v128_xor( p1, x );
p1 = v128_xoraesencxor( x, k13, p1 );
h[0] = v128_xor( h[0], p2 );
h[1] = v128_xor( h[1], p3 );

View File

@@ -171,6 +171,53 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
{{ -30, 55, -58, -65, -95, -40, -98, 94 }},
};
#if defined(__AVX2__)
static const __m256i V256_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff,
0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#define V128_00FF _mm256_castsi256_si128( V256_00FF )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_00FF = { 0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff };
#endif
#if defined(SIMD512)
static const __m512i V512_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V256_0101 _mm512_castsi512_si256( V512_0101 )
#define V128_0101 _mm512_castsi512_si128( V512_0101 )
static const __m512i V512_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V256_0080 _mm512_castsi512_si256( V512_0080 )
#define V128_0080 _mm512_castsi512_si128( V512_0080 )
#elif defined(__AVX2__)
static const __m256i V256_0101 = { 0x0101010101010101, 0x0101010101010101,
0x0101010101010101, 0x0101010101010101 };
#define V128_0101 _mm256_castsi256_si128( V256_0101 )
static const __m256i V256_0080 = { 0x0080008000800080, 0x0080008000800080,
0x0080008000800080, 0x0080008000800080 };
#define V128_0080 _mm256_castsi256_si128( V256_0080 )
#elif defined(__SSE2__) || defined(__ARM_NEON )
static const v128u64_t V128_0101 = { 0x0101010101010101, 0x0101010101010101 };
static const v128u64_t V128_0080 = { 0x0080008000800080, 0x0080008000800080 };
#endif
#if defined(__x86_64__)
#define SHUFXOR_1(x) _mm_shuffle_epi32(x,0xb1)
@@ -190,13 +237,10 @@ static const m128_v16 FFT256_twiddle[] __attribute__((aligned(64))) =
#define shufxor(x,s) XCAT(SHUFXOR_,s)(x)
#define REDUCE(x) \
v128_sub16( v128_and( x, v128_64( \
0x00ff00ff00ff00ff ) ), v128_sra16( x, 8 ) )
v128_sub16( v128_and( x, V128_00FF ), v128_sra16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
v128_sub16( x, v128_and( \
v128_64( 0x0101010101010101 ), \
v128_cmpgt16( x, v128_64( 0x0080008000800080 ) ) ) )
v128_sub16( x, v128_and( V128_0101, v128_cmpgt16( x, V128_0080 ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -293,10 +337,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
v128u16_t t1= X(i); \
v128u16_t t2= X(j); \
X(i) = v128_unpacklo16( t1, t2 ); \
X(j) = v128_unpackhi16( t1, t2 ); \
v128u16_t t = X(i); \
X(i) = v128_unpacklo16( t, X(j) ); \
X(j) = v128_unpackhi16( t, X(j) ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -803,23 +846,12 @@ static const m256_v16 FFT256_Twiddle[] =
#define shufxor2w(x,s) XCAT(SHUFXOR_,s)(x)
#if defined(VL256)
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
_mm256_srai_epi16( x, 8 ) )
#else
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
#endif
_mm256_sub_epi16( _mm256_and_si256( x, V256_00FF ), _mm256_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
_mm256_set1_epi64x( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )
_mm256_sub_epi16( x, _mm256_and_si256( V256_0101, \
_mm256_cmpgt_epi16( x, V256_0080 ) ) )
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
@@ -917,10 +949,9 @@ do { \
// This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
__m256i t1= X(i); \
__m256i t2= X(j); \
X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
__m256i t = X(i); \
X(i) = _mm256_unpacklo_epi16( t, X(j) ); \
X(j) = _mm256_unpackhi_epi16( t, X(j) ); \
} while(0)
INTERLEAVE( 1, 0 );
@@ -1658,10 +1689,8 @@ static const m512_v16 FFT256_Twiddle4w[] =
_mm512_srai_epi16( x, 8 ) )
#define EXTRA_REDUCE_S4w(x) \
_mm512_sub_epi16( x, _mm512_and_si512( \
_mm512_set1_epi64( 0x0101010101010101 ), \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )
_mm512_sub_epi16( x, _mm512_and_si512( V512_0101, \
_mm512_movm_epi16( _mm512_cmpgt_epi16_mask( x, V512_0080 ) ) ) )
// generic, except it calls targetted macros
#define REDUCE_FULL_S4w( x ) EXTRA_REDUCE_S4w( REDUCE4w (x ) )

View File

@@ -640,24 +640,25 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#if defined(__AVX2__)
__m256i F0, F1, F2, F3, F4, F5, F6, F7;
__m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
__m256i *table = (__m256i*)fftTable;
__m256i tbl = table[ input[0] ];
__m256i *mul = (__m256i*)multipliers;
__m256i *out = (__m256i*)output;
F0 = _mm256_mullo_epi32( mul[0], tbl );
tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
tbl = table[ input[1] ];
F1 = _mm256_mullo_epi32( mul[1], tbl );
tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
tbl = table[ input[2] ];
F2 = _mm256_mullo_epi32( mul[2], tbl );
tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
tbl = table[ input[3] ];
F3 = _mm256_mullo_epi32( mul[3], tbl );
tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
tbl = table[ input[4] ];
F4 = _mm256_mullo_epi32( mul[4], tbl );
tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
tbl = table[ input[5] ];
F5 = _mm256_mullo_epi32( mul[5], tbl );
tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
tbl = table[ input[6] ];
F6 = _mm256_mullo_epi32( mul[6], tbl );
tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
tbl = table[ input[7] ];
F7 = _mm256_mullo_epi32( mul[7], tbl );
#define ADD_SUB( a, b ) \
@@ -677,9 +678,9 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F1, F3 );
ADD_SUB( F4, F6 );
ADD_SUB( F5, F7 );
F5 = _mm256_slli_epi32( F5, 2 );
F6 = _mm256_slli_epi32( F6, 4 );
F7 = _mm256_slli_epi32( F7, 6 );
F5 = _mm256_slli_epi32( F5, 2 );
ADD_SUB( F0, F4 );
ADD_SUB( F1, F5 );
ADD_SUB( F2, F6 );

View File

@@ -4,11 +4,11 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null
rm cpuminer cpuminer-m2 cpuminer-m4 cpuminer-armv9-crypto-sha3 cpuminer-armv9-crypto cpuminer-armv9 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8 cpuminer-armv8-crypto > /dev/null
# armv9 needs gcc-13
# -march-armv9-a includes SVE2 but no crypto
# -march=armv9-a+crypto adds AES & SHA2 but not SHA512
# -march=armv9-a+crypto adds AES & SHA256 but not SHA512
make distclean || echo clean
rm -f config.status
@@ -27,18 +27,37 @@ CFLAGS="-O3 -march=armv9-a -Wall -flax-vector-conversions" ./configure --with-c
make -j $(nproc)
mv cpuminer cpuminer-armv9
# Apple M4: armv9.2, AES, SHA3, SVE2
make clean || echo clean
CFLAGS="-O3 -march=armv9.2-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m4
# Apple M2: armv8.6, AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.6-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-m2
# SVE2 available in armv8.5
make clean || echo clean
CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+sve2 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.5-crypto-sha3-sve2
# SHA3 available in armv8.4
# Apple M1: armv8.4 AES, SHA3
make clean || echo clean
CFLAGS="-O3 -march=armv8.4-a+crypto+sha3 -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8.4-crypto-sha3
# Cortex-A76 (Orange Pi 5)
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8.2-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl
make -j $(nproc)
mv cpuminer cpuminer-armv8-crypto
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=armv8-a+crypto -Wall -flax-vector-conversions" ./configure --with-curl

View File

@@ -34,9 +34,7 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-arrowlake-s
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
# Granitrapids does not build with AVX10, SHA512 or APX.
# wait for Diamondrapids & gcc-15.
# Intel Core Graniterapids: AVX512, SHA256, VAES, AMX, needs gcc-14
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
@@ -44,13 +42,13 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-graniterapids
# SHA512 AVX10.1
#make clean || echo clean
#rm -f config.status
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
#make -j $(nproc)
#strip -s cpuminer
#mv cpuminer cpuminer-avx10_1
# Graniterapids + SHA512, AVX10.1
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx10.1
# SHA512 AVX10.2
#make clean || echo clean
@@ -72,6 +70,9 @@ mv cpuminer cpuminer-graniterapids
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
# zen4 is close enough for older compiler
#CFLAGS="-O3 -march=znver4 -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-zen5
@@ -138,13 +139,21 @@ make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
# SSE4.2 AES SHA: Intel Atom Goldmont, newer Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=goldmont -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-sse42-aes-sha
# SSE4.2 AES: Intel Westmere, older Pentium & Celeron
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall" ./configure --with-curl
make -j $(nproc)
strip -s cpuminer
mv cpuminer cpuminer-aes-sse42
mv cpuminer cpuminer-sse42-aes
# SSE4.2: Intel Nehalem
make clean || echo clean

View File

@@ -2,8 +2,8 @@
#
# make clean and rm all the targeted executables.
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen* cpuminer-x64 cpuminer-armv* > /dev/null
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512* cpuminer-alderlake cpuminer-avx10* cpuminer-avx2* cpuminer-avx cpuminer-sse* cpuminer-ssse3 cpuminer-zen* cpuminer-x64 cpuminer-armv* cpuminer-m2 cpuminer-m4 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null
rm cpuminer-avx512* cpuminer-avx2* cpuminer-avx.exe cpuminer-sse* cpuminer-zen* cpuminer-x64.exe > /dev/null
make distclean > /dev/null

View File

@@ -108,7 +108,24 @@ extern "C"{
} while (0)
#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)
{ \
(Y0) = AES0[(X0) & 0xFF] \
^ AES1[((X1) >> 8) & 0xFF] \
^ AES2[((X2) >> 16) & 0xFF] \
^ AES3[((X3) >> 24) & 0xFF]; \
(Y1) = AES0[(X1) & 0xFF] \
^ AES1[((X2) >> 8) & 0xFF] \
^ AES2[((X3) >> 16) & 0xFF] \
^ AES3[((X0) >> 24) & 0xFF]; \
(Y2) = AES0[(X2) & 0xFF] \
^ AES1[((X3) >> 8) & 0xFF] \
^ AES2[((X0) >> 16) & 0xFF] \
^ AES3[((X1) >> 24) & 0xFF]; \
(Y3) = AES0[(X3) & 0xFF] \
^ AES1[((X0) >> 8) & 0xFF] \
^ AES2[((X1) >> 16) & 0xFF] \
^ AES3[((X2) >> 24) & 0xFF]; \
}
#endif

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.5.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.7.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.5'
PACKAGE_STRING='cpuminer-opt 25.5'
PACKAGE_VERSION='25.7'
PACKAGE_STRING='cpuminer-opt 25.7'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1431,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
esac
cat <<\_ACEOF
@@ -1536,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.5
cpuminer-opt configure 25.7
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.5, which was
It was created by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3591,7 +3591,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.5'
VERSION='25.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.5, which was
This file was extended by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.5
cpuminer-opt config.status 25.7
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [25.5])
AC_INIT([cpuminer-opt], [25.7])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.5.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.7.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='25.5'
PACKAGE_STRING='cpuminer-opt 25.5'
PACKAGE_VERSION='25.7'
PACKAGE_STRING='cpuminer-opt 25.7'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
'configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems.
'configure' configures cpuminer-opt 25.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1424,7 +1424,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 25.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.7:";;
esac
cat <<\_ACEOF
@@ -1528,7 +1528,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 25.5
cpuminer-opt configure 25.7
generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 25.5, which was
It was created by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3065,7 +3065,7 @@ ac_config_headers="$ac_config_headers cpuminer-config.h"
am__api_version='1.17'
am__api_version='1.18'
# Find a good install program. We prefer a C program (faster),
@@ -3334,10 +3334,14 @@ am_lf='
'
case `pwd` in
*[\\\"\#\$\&\'\`$am_lf]*)
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;;
esac
case $srcdir in
*[\\\"\#\$\&\'\`$am_lf\ \ ]*)
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;;
esac
@@ -3764,7 +3768,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='25.5'
VERSION='25.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -3802,9 +3806,133 @@ AMTAR='$${TAR-tar}'
# We'll loop over all known methods to create a tar archive until one works.
_am_tools='gnutar pax cpio none'
_am_tools='gnutar plaintar pax cpio none'
am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
# The POSIX 1988 'ustar' format is defined with fixed-size fields.
# There is notably a 21 bits limit for the UID and the GID. In fact,
# the 'pax' utility can hang on bigger UID/GID (see automake bug#8343
# and bug#13588).
am_max_uid=2097151 # 2^21 - 1
am_max_gid=$am_max_uid
# The $UID and $GID variables are not portable, so we need to resort
# to the POSIX-mandated id(1) utility. Errors in the 'id' calls
# below are definitely unexpected, so allow the users to see them
# (that is, avoid stderr redirection).
am_uid=`id -u || echo unknown`
am_gid=`id -g || echo unknown`
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether UID '$am_uid' is supported by ustar format" >&5
printf %s "checking whether UID '$am_uid' is supported by ustar format... " >&6; }
if test x$am_uid = xunknown; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&5
printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&2;}
elif test $am_uid -le $am_max_uid; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
_am_tools=none
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GID '$am_gid' is supported by ustar format" >&5
printf %s "checking whether GID '$am_gid' is supported by ustar format... " >&6; }
if test x$gm_gid = xunknown; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&5
printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&2;}
elif test $am_gid -le $am_max_gid; then
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
printf "%s\n" "yes" >&6; }
else
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
printf "%s\n" "no" >&6; }
_am_tools=none
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking how to create a ustar tar archive" >&5
printf %s "checking how to create a ustar tar archive... " >&6; }
# Go ahead even if we have the value already cached. We do so because we
# need to set the values for the 'am__tar' and 'am__untar' variables.
_am_tools=${am_cv_prog_tar_ustar-$_am_tools}
for _am_tool in $_am_tools; do
case $_am_tool in
gnutar)
for _am_tar in tar gnutar gtar; do
{ echo "$as_me:$LINENO: $_am_tar --version" >&5
($_am_tar --version) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); } && break
done
am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"'
am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"'
am__untar="$_am_tar -xf -"
;;
plaintar)
# Must skip GNU tar: if it does not support --format= it doesn't create
# ustar tarball either.
(tar --version) >/dev/null 2>&1 && continue
am__tar='tar chf - "$$tardir"'
am__tar_='tar chf - "$tardir"'
am__untar='tar xf -'
;;
pax)
am__tar='pax -L -x ustar -w "$$tardir"'
am__tar_='pax -L -x ustar -w "$tardir"'
am__untar='pax -r'
;;
cpio)
am__tar='find "$$tardir" -print | cpio -o -H ustar -L'
am__tar_='find "$tardir" -print | cpio -o -H ustar -L'
am__untar='cpio -i -H ustar -d'
;;
none)
am__tar=false
am__tar_=false
am__untar=false
;;
esac
# If the value was cached, stop now. We just wanted to have am__tar
# and am__untar set.
test -n "${am_cv_prog_tar_ustar}" && break
# tar/untar a dummy directory, and stop if the command works.
rm -rf conftest.dir
mkdir conftest.dir
echo GrepMe > conftest.dir/file
{ echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5
(tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
rm -rf conftest.dir
if test -s conftest.tar; then
{ echo "$as_me:$LINENO: $am__untar <conftest.tar" >&5
($am__untar <conftest.tar) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
{ echo "$as_me:$LINENO: cat conftest.dir/file" >&5
(cat conftest.dir/file) >&5 2>&5
ac_status=$?
echo "$as_me:$LINENO: \$? = $ac_status" >&5
(exit $ac_status); }
grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
fi
done
rm -rf conftest.dir
if test ${am_cv_prog_tar_ustar+y}
then :
printf %s "(cached) " >&6
else case e in #(
e) am_cv_prog_tar_ustar=$_am_tool ;;
esac
fi
{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_tar_ustar" >&5
printf "%s\n" "$am_cv_prog_tar_ustar" >&6; }
@@ -4986,7 +5114,10 @@ _ACEOF
break
fi
done
rm -f core conftest*
# aligned with autoconf, so not including core; see bug#72225.
rm -f -r a.out a.exe b.out conftest.$ac_ext conftest.$ac_objext \
conftest.dSYM conftest1.$ac_ext conftest1.$ac_objext conftest1.dSYM \
conftest2.$ac_ext conftest2.$ac_objext conftest2.dSYM
unset am_i ;;
esac
fi
@@ -7450,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 25.5, which was
This file was extended by cpuminer-opt $as_me 25.7, which was
generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7518,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 25.5
cpuminer-opt config.status 25.7
configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\"

View File

@@ -921,39 +921,32 @@ out:
return rc;
}
// Scale a hashrate to a human-readable magnitude and report the matching
// metric prefix character: 0 (none), then 'k', 'M', 'G', 'T', 'P', 'E',
// 'Z', 'Y'. Factors of 1000 are used, but scaling only begins at 1e4 so
// small rates keep full precision. Both arguments are updated in place.
void scale_hash_for_display ( double* hashrate, char* prefix )
{
   static const char units[8] = { 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y' };
   double limit = 1e4;    // lowest rate that gets scaled at all
   double divisor = 1e3;
   int i;

   if ( *hashrate < limit )
   {
      *prefix = 0;     // small enough to display unscaled
      return;
   }
   // Each step up covers three decimal orders of magnitude. Stop at the
   // first range that holds the rate; falling out of the loop selects 'Y'.
   for ( i = 0; i < 7; i++, limit *= 1e3, divisor *= 1e3 )
      if ( *hashrate < limit * 1e3 ) break;
   *prefix = units[i];
   *hashrate /= divisor;
}
// Does not account for leap years.
static inline void sprintf_et( char *str, long unsigned int seconds )
{
long unsigned int min = seconds / 60;
long unsigned int sec = seconds % 60;
long unsigned int hrs = min / 60;
if ( unlikely( hrs ) )
long unsigned int minutes = seconds / 60;
if ( minutes )
{
long unsigned int days = hrs / 24;
long unsigned int years = days / 365;
if ( years ) // 0y000d
sprintf( str, "%luy%lud", years, years % 365 );
else if ( days ) // 0d00h
sprintf( str, "%lud%02luh", days, hrs % 24 );
else // 0h00m
sprintf( str, "%luh%02lum", hrs, min % 60 );
long unsigned int hours = minutes / 60;
if ( hours )
{
long unsigned int days = hours / 24;
if ( days )
{
long unsigned int years = days / 365;
if ( years )
sprintf( str, "%luy%03lud", years, days % 365 ); // 0y000d
else
sprintf( str, "%lud%02luh", days, hours % 24 ); // 0d00h
}
else
sprintf( str, "%luh%02lum", hours, minutes % 60 ); // 0h00m
}
else
sprintf( str, "%lum%02lus", minutes, seconds % 60 ); // 0m00s
}
else // 0m00s
sprintf( str, "%lum%02lus", min, sec );
else
sprintf( str, "%lus", seconds ); // 0s
}
const long double exp32 = EXP32; // 2**32
@@ -2833,67 +2826,29 @@ static void show_credits()
static bool cpu_capability( bool display_only )
{
char cpu_brand[0x40];
bool cpu_has_sse2 = has_sse2(); // X86_64 only
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
bool cpu_has_sse41 = has_sse41(); // X86_64 only
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx();
bool cpu_has_neon = has_neon(); // AArch64
bool cpu_has_sve = has_sve(); // aarch64 only, insignificant
bool cpu_has_sve2 = has_sve2(); // AArch64 only
bool cpu_has_sme = has_sme();
bool cpu_has_sme2 = has_sme2();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_avx512 = has_avx512();
bool cpu_has_avx10 = has_avx10();
bool cpu_has_aes = has_aes(); // x86_64 or AArch64
bool cpu_has_vaes = has_vaes(); // X86_64 only
bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64
bool cpu_has_sha512 = has_sha512();
bool sw_has_x86_64 = false;
bool sw_has_aarch64 = false;
int sw_arm_arch = 0; // AArch64 version
bool sw_has_neon = false; // AArch64
bool sw_has_sve = false; // AArch64
bool sw_has_sve2 = false; // AArch64
bool sw_has_sve = false;
bool sw_has_sve2 = false;
bool sw_has_sme = false;
bool sw_has_sme2 = false;
bool sw_has_sse2 = false; // x86_64
bool sw_has_ssse3 = false; // x86_64
bool sw_has_sse41 = false; // x86_64
bool sw_has_ssse3 = false;
bool sw_has_sse41 = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_avx10 = false;
bool sw_has_aes = false;
bool sw_has_vaes = false;
bool sw_has_amx = false;
bool sw_has_apx = false;
bool sw_has_aes = false; // x86_64 or AArch64
bool sw_has_vaes = false; // x86_64
bool sw_has_sha256 = false; // x86_64 or AArch64
bool sw_has_sha512 = false; // x86_64 or AArch64
/*
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features );
bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features );
bool algo_has_neon = set_incl( NEON_OPT, algo_features );
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_aes;
bool use_vaes;
bool use_sha256;
bool use_sha512;
bool use_neon;
bool use_none;
*/
bool sw_has_sha512 = false;
#if defined(__x86_64__)
sw_has_x86_64 = true;
#elif defined(__aarch64__)
@@ -2928,14 +2883,15 @@ static bool cpu_capability( bool display_only )
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
sw_has_avx512 = true;
#endif
// AVX10 version is not significant as of AVX10.2. If that changes use a better
// way to test the version than sequentially.
// #if defined(__AVX10_2__)
//
// #elif defined(__AVX10_1__)
#if defined(__AVX10_1__)
#if defined(__AVX10_1__) // version is not significant
sw_has_avx10 = true;
#endif
#ifdef __AMX_TILE__
sw_has_amx = true;
#endif
#ifdef __APX_F__
sw_has_apx = true;
#endif
// x86_64 or AArch64
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
@@ -2955,6 +2911,7 @@ static bool cpu_capability( bool display_only )
#if defined(__ARM_NEON)
sw_has_neon = true;
#endif
// FYI, SVE & SME not used by cpuminer
#if defined(__ARM_FEATURE_SVE)
sw_has_sve = true;
#endif
@@ -2975,8 +2932,7 @@ static bool cpu_capability( bool display_only )
// Build
printf( "SW built on " __DATE__
#if defined(__clang__)
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__,
__clang_patchlevel__ );
" with CLANG-%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__ );
#elif defined(__GNUC__)
" with GCC-%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__ );
#endif
@@ -3002,27 +2958,30 @@ static bool cpu_capability( bool display_only )
printf("CPU features: ");
if ( cpu_arch_x86_64() )
{
if ( cpu_has_avx10 ) printf( " AVX10.%d", avx10_version() );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( has_avx10() ) printf( " AVX10.%d", avx10_version() );
else if ( has_avx512() ) printf( " AVX512" );
else if ( has_avx2() ) printf( " AVX2 " );
else if ( has_avx() ) printf( " AVX " );
else if ( has_sse42() ) printf( " SSE4.2" );
else if ( has_sse41() ) printf( " SSE4.1" );
else if ( has_ssse3() ) printf( " SSSE3 " );
else if ( has_sse2() ) printf( " SSE2 " );
if ( has_amx() ) printf( " AMX" );
if ( has_apx_f() ) printf( " APX" );
}
else if ( cpu_arch_aarch64() )
{
if ( cpu_has_neon ) printf( " NEON" );
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
else if ( cpu_has_sve ) printf( " SVE" );
if ( cpu_has_sme2 ) printf( " SME2" );
else if ( cpu_has_sme ) printf( " SME" );
if ( has_neon() ) printf( " NEON" );
if ( has_sve2() ) printf( " SVE2-%d", sve_vector_length() );
else if ( has_sve() ) printf( " SVE" );
if ( has_sme2() ) printf( " SME2" );
else if ( has_sme() ) printf( " SME" );
}
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha512 ) printf( " SHA512" );
else if ( cpu_has_sha256 ) printf( " SHA256" );
if ( has_vaes() ) printf( " VAES" );
else if ( has_aes() ) printf( " AES" );
if ( has_sha512() ) printf( " SHA512" );
else if ( has_sha256() ) printf( " SHA256" );
printf("\nSW features: ");
if ( sw_has_x86_64 )
@@ -3035,6 +2994,8 @@ static bool cpu_capability( bool display_only )
else if ( sw_has_sse41 ) printf( " SSE4.1" );
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_amx ) printf( " AMX" );
if ( sw_has_apx ) printf( " APX" );
}
else if ( sw_has_aarch64 )
{

View File

@@ -542,7 +542,9 @@ void applog_hash(void *hash);
void format_hashrate(double hashrate, char *output);
void print_hash_tests(void);
// Factors of 1000 used for hashes, ie kH/s, Mh/s.
void scale_hash_for_display ( double* hashrate, char* units );
// Factors of 1024 used for bytes, ie kiB, MiB.
void format_number_si( double* hashrate, char* si_units );
void report_summary_log( bool force );
@@ -582,6 +584,8 @@ enum algos {
ALGO_ANIME,
ALGO_ARGON2D250,
ALGO_ARGON2D500,
ALGO_ARGON2D1000,
ALGO_ARGON2D16000,
ALGO_ARGON2D4096,
ALGO_AXIOM,
ALGO_BLAKE,
@@ -677,6 +681,8 @@ static const char* const algo_names[] = {
"anime",
"argon2d250",
"argon2d500",
"argon2d1000",
"argon2d16000",
"argon2d4096",
"axiom",
"blake",
@@ -837,6 +843,8 @@ Options:\n\
anime Animecoin (ANI)\n\
argon2d250\n\
argon2d500\n\
argon2d1000\n\
argon2d16000\n\
argon2d4096\n\
axiom Shabal-256 MemoHash\n\
blake blake256r14 (SFR)\n\

View File

@@ -137,10 +137,24 @@
#define v128_unpackhi8 _mm_unpackhi_epi8
// AES
// Nokey means nothing on x86_64 but it saves an instruction and a register
// on ARM.
#define v128_aesenc _mm_aesenc_si128
// xor key with result after encryption, x86_64 format.
#define v128_aesencxor _mm_aesenc_si128
// default is x86_64 format.
#define v128_aesenc v128_aesencxor
// xor key with v before encryption, arm64 format.
#define v128_xoraesenc( v, k ) \
_mm_aesenc_si128( v128_xor( v, k ), v128_zero )
// xor v with k_in before encryption then xor the result with k_out afterward.
// Uses the applicable optimization based on the target.
#define v128_xoraesencxor( v, k_in, k_out ) \
_mm_aesenc_si128( v128_xor( v, k_in ), k_out )
// arm64 optimized
#define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero )
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero )
#define v128_aesdec _mm_aesdec_si128
@@ -433,7 +447,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#define v128_nxor( a, b ) _mm_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -455,7 +469,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#define v128_nxor( a, b ) v128_not( _mm_xor_si128( a, b ) )
#endif

View File

@@ -170,7 +170,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 )
// ~( a ^ b ), same as (~a) ^ b
#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#define mm256_nxor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 )
#else
@@ -208,7 +208,7 @@ static inline __m256i mm256_not( const __m256i v )
#define mm256_orand( a, b, c ) \
_mm256_or_si256( a, _mm256_and_si256( b, c ) )
#define mm256_xnor( a, b ) \
#define mm256_nxor( a, b ) \
mm256_not( _mm256_xor_si256( a, b ) )
#endif

View File

@@ -18,8 +18,13 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// "_mm512_cmp" instructions now returns a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction.
// "_mm512_cmp" instructions now return a bitmask instead of a vector mask.
// This removes the need for an explicit movemask instruction. It is also
// slower than AVX2 cmp. There is no version of AVX512 cmp that returns a
// vector result resulting in a double penalty if a vector result is needed:
// slower cmp instruction & extra instruction to convert bit mask into
// vector mask. 256 bit & 128 bit still have legacy cmp returning vector
// while AVX512VL adds masked versions returning bit mask.
//
// Many previously sizeless (si) instructions now have sized (epi) versions
// to accommodate masking packed elements.
@@ -30,7 +35,7 @@
// list.
//
// "_mm512_permutex_epi64" only shuffles within 256 bit lanes. All other
// AVX512 permutes can cross all lanes.
// AVX512 instructions using the permute name can cross all lanes.
//
// New alignr instructions for epi64 and epi32 operate across the entire
// vector but slower than epi8 which continues to be restricted to 128 bit
@@ -50,16 +55,17 @@
// parentheses to ensure the expression argument is evaluated first.
// - if an argument is to referenced multiple times a C inline function
// should be used instead of a macro to prevent an expression argument
// from being evaluated multiple times (wasteful) or produces side
// from being evaluated multiple times (wasteful) or produce side
// effects (very bad).
//
// There are 2 areas where overhead is a major concern: constants and
// permutations.
//
// Constants need to be composed at run time by assembling individual
// elements, very expensive. The cost is proportional to the number of
// different elements therefore use the largest element size possible,
// merge smaller integer elements to 64 bits, and group repeated elements.
// elements or loaded from memory, very expensive. The cost of runtime
// construction is proportional to the number of different elements
// therefore use the largest element size possible merging smaller integer
// elements to 64 bits, and group repeated elements.
//
// Constants with repeating patterns can be optimized with the smaller
// patterns repeated more frequently being more efficient.
@@ -67,14 +73,15 @@
// Some specific constants can be very efficient. Zero is very efficient,
// 1 and -1 slightly less so.
//
// If an expensive constant is to be reused in the same function it should
// be declared as a local variable defined once and reused.
// If an expensive constant is to be reused in the same function it may
// be declared as a local variable defined once and reused. If frequently
// used it can be declared as a static const in memory.
//
// Permutations can be very expensive if they use a vector control index,
// even if the permutation itself is quite efficient.
// The index is essentially a constant with all the baggage that brings.
// The same rules apply, if an index is to be reused it should be defined
// as a local. This applies specifically to bswap operations.
// even if the permute instruction itself is quite efficient.
// The index is essentially a vector constant with all the baggage that
// brings. The same rules apply, if an index is to be reused it should either
// be defined as a local or static const.
//
// Permutations that cross 128 bit lanes are typically slower and often need
// a vector control index. If the permutation doesn't need to cross 128 bit
@@ -221,7 +228,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 )
// ~( a ^ b ), (~a) ^ b
#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
#define mm512_nxor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 )
// ~( a & b )
#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef )
@@ -241,6 +248,15 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_ror_16( v, c ) _mm512_shrdi_epi16( c, v, v )
#define mm512_rol_16( v, c ) _mm512_shldi_epi16( c, v, v )
#endif
*/
//
// Reverse byte order of packed elements, vectorized endian conversion.
@@ -249,9 +265,17 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
#define mm512_bswap_32( v ) _mm512_shuffle_epi8( v, V512_BSWAP32 )
/* not used
#if defined(__AVX512VBMI2__)
#define mm512_bswap_16( v ) mm512_ror_16( v, 8 )
#else
#define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, mm512_bcast128( _mm_set_epi64x( \
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
0x0e0f0c0d0a0b0809, 0x0607040502030001 ) ) )
#endif
*/
#define mm512_bswap_16( v ) \
@@ -431,8 +455,7 @@ static inline __m512i mm512_shuflr128_x8( const __m512i v, const int c )
_mm512_castsi512_ps( v2 ), c ) );
// 64 bit lanes
// ROL, ROR not necessary with AVX512, included for consistency with AVX2/SSE.
// Redundant with ror & rol but included for consistency with AVX2/SSE.
#define mm512_qrev32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32 mm512_qrev32 // grandfathered

View File

@@ -126,7 +126,7 @@
#define v128_andnot( v1, v0 ) vbicq_u32( v0, v1 )
// ~( v1 ^ v0 ), same as (~v1) ^ v0
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
#define v128_nxor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, args reversed for consistency with x86_64
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
@@ -187,9 +187,21 @@
// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
// AES
// consistent with Intel AES intrinsics, break up for optimizing
#define v128_aesenc( v, k ) \
v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) )
// xor key with result after encryption, x86_64 format.
#define v128_aesencxor( v, k ) \
v128_xor( vaesmcq_u8( vaeseq_u8( v, v128_zero ) ), k )
// default is x86_64 format.
#define v128_aesenc v128_aesencxor
// xor key with v before encryption, arm64 format.
#define v128_xoraesenc( v, k ) \
vaesmcq_u8( vaeseq_u8( v, k ) )
// xor v with k_in before encryption then xor the result with k_out afterward.
// Uses the applicable optimization based on the target.
#define v128_xoraesencxor( v, k_in, k_out ) \
v128_xor( v128_xoraesenc( v, k_in ), k_out )
#define v128_aesenc_nokey( v ) \
vaesmcq_u8( vaeseq_u8( v, v128_zero ) )
@@ -337,52 +349,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
vqtbl1q_u8( (uint8x16_t)(v), (uint8x16_t)(vmask) )
// Bit rotation
/*
#define v128_shuflr64_8( v ) v128_shuffle8( v, V128_SHUFLR64_8 )
#define v128_shufll64_8( v ) v128_shuffle8( v, V128_SHUFLL64_8 )
#define v128_shuflr64_16(v ) v128_shuffle8( v, V128_SHUFLR64_16 )
#define v128_shufll64_16(v ) v128_shuffle8( v, V128_SHUFLL64_16 )
#define v128_shuflr64_24(v ) v128_shuffle8( v, V128_SHUFLR64_24 )
#define v128_shufll64_24(v ) v128_shuffle8( v, V128_SHUFLL64_24 )
#define v128_shuflr32_8( v ) v128_shuffle8( v, V128_SHUFLR32_8 )
#define v128_shufll32_8( v ) v128_shuffle8( v, V128_SHUFLL32_8 )
#define v128_ror64( v, c ) \
( (c) == 8 ) ? v128_shuflr64_8( v ) \
: ( (c) == 16 ) ? v128_shuflr64_16( v ) \
: ( (c) == 24 ) ? v128_shuflr64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shufll64_24( v ) \
: ( (c) == 48 ) ? v128_shufll64_16( v ) \
: ( (c) == 56 ) ? v128_shufll64_8( v ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_rol64( v, c ) \
( (c) == 8 ) ? v128_shufll64_8( v ) \
: ( (c) == 16 ) ? v128_shufll64_16( v ) \
: ( (c) == 24 ) ? v128_shufll64_24( v ) \
: ( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \
: ( (c) == 40 ) ? v128_shuflr64_24( v ) \
: ( (c) == 48 ) ? v128_shuflr64_16( v ) \
: ( (c) == 56 ) ? v128_shuflr64_8( v ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)(v)), 64-(c) ), \
((uint64x2_t)(v)), c )
#define v128_ror32( v, c ) \
( (c) == 8 ) ? v128_shuflr32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shufll32_8( v ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
#define v128_rol32( v, c ) \
( (c) == 8 ) ? v128_shufll32_8( v ) \
: ( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint16x8_t)(v)) ) \
: ( (c) == 24 ) ? v128_shuflr32_8( v ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)(v)), 32-(c) ), \
((uint32x4_t)(v)), c )
*/
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint32x4_t)(v)) ) \

View File

@@ -474,6 +474,7 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
}
/*
// ARM feature compiler flags
#ifdef __aarch64__
#warning "__aarch64__"
#endif
@@ -509,16 +510,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
#endif
*/
// Typical display format: AVX10.[version]_[vectorlength], if vector length is
// omitted 256 is the default.
// Ex: AVX10.1_512
// Flags:
// AVX10 128 256 512
// 0 0 0 0 = AVX10 not supported
// 1 1 1 0 = AVX10 256 bit max (version 2)
// 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids)
// Other combinations are not defined.
static inline bool cpu_arch_x86_64()
{
#if defined(__x86_64__)
@@ -766,6 +757,17 @@ static inline bool has_vbmi2()
#endif
}
// Detect Intel AMX tile support.
// x86_64: queries CPUID extended-features leaf (subleaf 0) and tests the
// AMX_TILE_Flag bit in EDX. On all other architectures AMX does not exist,
// so the function unconditionally reports false.
static inline bool has_amx()
{
#if defined(__x86_64__)
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EDX_Reg ] & AMX_TILE_Flag;
#else
return false;
#endif
}
static inline bool has_aes()
{
#if defined(__x86_64__)
@@ -815,12 +817,9 @@ static inline bool has_sveaes()
static inline bool has_sha256()
{
#if defined(__x86_64__)
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 0, cpu_info );
return cpu_info[ EBX_Reg ] & SHA_Flag;
}
return false;
#elif defined(__aarch64__) && defined(HWCAP_SHA2)
// NEON SHA256
@@ -835,7 +834,7 @@ static inline bool has_sha256()
static inline bool has_sha512()
{
#if defined(__x86_64__)
if ( has_avx2() )
if ( has_avx() )
{
unsigned int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, 1, cpu_info );
@@ -852,7 +851,6 @@ static inline bool has_sha512()
#endif
}
// Arm only
static inline bool has_sha3()
{
#if defined(__aarch64__) && defined(HWCAP_SHA3)
@@ -944,16 +942,6 @@ static inline int sve_vector_length()
return 0;
}
// Assume min_vlen refers to the register size
// Report the RISC-V Vector (RVV) register length in bits, or 0 when the
// build target has no vector extension.
// NOTE(review): __riscv_v_min_vlen is the compile-time *minimum* VLEN the
// code was built for; the hardware VLEN at run time may be larger — the
// comment above ("Assume min_vlen refers to the register size") reflects
// that assumption.
static inline int rvv_vector_length()
{
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}
// generic
static inline int vector_length()
{
#if defined(__x86_64__)
@@ -965,8 +953,8 @@ static inline int vector_length()
return has_sve() ? sve_vector_length()
: has_neon() ? 128
: 0;
#elif defined(__riscv) && defined(__riscv_vector)
return rvv_vector_length();
#elif defined(__riscv) && defined(__riscv_vector) && defined(__riscv_v_min_vlen)
return __riscv_v_min_vlen;
#endif
return 0;
}

49
util.c
View File

@@ -304,39 +304,28 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
free(cmd);
}
void format_hashrate(double hashrate, char *output)
// Decimal SI, factors of 1000
void scale_hash_for_display ( double* hashrate, char* prefix )
{
char prefix = '\0';
if (hashrate < 10000) {
// nop
}
else if (hashrate < 1e7) {
prefix = 'k';
hashrate *= 1e-3;
}
else if (hashrate < 1e10) {
prefix = 'M';
hashrate *= 1e-6;
}
else if (hashrate < 1e13) {
prefix = 'G';
hashrate *= 1e-9;
}
else {
prefix = 'T';
hashrate *= 1e-12;
}
sprintf(
output,
prefix ? "%.2f %cH/s" : "%.2f H/s%c",
hashrate, prefix
);
if ( *hashrate < 1e4 ) *prefix = 0;
else if ( *hashrate < 1e7 ) { *prefix = 'k'; *hashrate /= 1e3; }
else if ( *hashrate < 1e10 ) { *prefix = 'M'; *hashrate /= 1e6; }
else if ( *hashrate < 1e13 ) { *prefix = 'G'; *hashrate /= 1e9; }
else if ( *hashrate < 1e16 ) { *prefix = 'T'; *hashrate /= 1e12; }
else if ( *hashrate < 1e19 ) { *prefix = 'P'; *hashrate /= 1e15; }
else if ( *hashrate < 1e22 ) { *prefix = 'E'; *hashrate /= 1e18; }
else if ( *hashrate < 1e25 ) { *prefix = 'Z'; *hashrate /= 1e21; }
else { *prefix = 'Y'; *hashrate /= 1e24; }
}
// For use with MiB etc
// Format a raw hash rate (hashes/second) as a human-readable string such as
// "123.45 MH/s", scaling the value and selecting a decimal-SI prefix via
// scale_hash_for_display. When the rate is small enough that no prefix is
// chosen (prefix == '\0') the output is "N.NN H/s".
// NOTE(review): writes with unbounded sprintf — assumes 'output' is large
// enough for the longest formatted rate; confirm buffer size at call sites.
void format_hashrate( double hashrate, char *output )
{
char prefix = '\0';
scale_hash_for_display( &hashrate, &prefix );
sprintf( output, prefix ? "%.2f %cH/s" : "%.2f H/s%c", hashrate, prefix );
}
// Binary SI, factors of 1024
void format_number_si( double* n, char* si_units )
{
if ( *n < 1024*10 ) { *si_units = 0; return; }

View File

@@ -55,9 +55,9 @@ the current directory it will be created.
Data file creation can take up to 30 minutes on a spinning hard drive.
Once created the new data file will be verified and used immediately
if a valid url and user were included on the command line.
if a valid url and user were included on the command line.
A default data file can be created by omitting the url option. That will
A default data file can also be created by omitting the url option. That will
either verify an existing default data file or create one and verify it,
then exit.